DataCamp offer interactive courses related to R Programming. While some is review, it is helpful to see other perspectives on material. As well, DataCamp has some interesting materials on packages that I want to learn better (ggplot2, dplyr, ggvis, etc.). This document summarizes a few key insights from:
The original DataCamp_Insights_v001 and DataCamp_Insights_v002 documents have been split for this document:
There are a few nuggets from within these beginning modules, including:
Below is some sample code showing examples for the generic statements:
library(ggplot2)
library(ggthemes)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Factors
xRaw = c("High", "High", "Low", "Low", "Medium", "Very High", "Low")
xFactorNon = factor(xRaw, levels=c("Low", "Medium", "High", "Very High"))
xFactorNon
## [1] High High Low Low Medium Very High Low
## Levels: Low Medium High Very High
xFactorNon[xFactorNon == "High"] > xFactorNon[xFactorNon == "Low"][1]
## Warning in Ops.factor(xFactorNon[xFactorNon == "High"],
## xFactorNon[xFactorNon == : '>' not meaningful for factors
## [1] NA NA
xFactorOrder = factor(xRaw, ordered=TRUE, levels=c("Low", "Medium", "High", "Very High"))
xFactorOrder
## [1] High High Low Low Medium Very High Low
## Levels: Low < Medium < High < Very High
xFactorOrder[xFactorOrder == "High"] > xFactorOrder[xFactorOrder == "Low"][1]
## [1] TRUE TRUE
# Subsets
data(mtcars)
subset(mtcars, mpg>=25)
## mpg cyl disp hp drat wt qsec vs am gear carb
## Fiat 128 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1
## Honda Civic 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2
## Toyota Corolla 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1
## Fiat X1-9 27.3 4 79.0 66 4.08 1.935 18.90 1 1 4 1
## Porsche 914-2 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 2
## Lotus Europa 30.4 4 95.1 113 3.77 1.513 16.90 1 1 5 2
identical(subset(mtcars, mpg>=25), mtcars[mtcars$mpg>=25, ])
## [1] TRUE
subset(mtcars, mpg>25, select=c("mpg", "cyl", "disp"))
## mpg cyl disp
## Fiat 128 32.4 4 78.7
## Honda Civic 30.4 4 75.7
## Toyota Corolla 33.9 4 71.1
## Fiat X1-9 27.3 4 79.0
## Porsche 914-2 26.0 4 120.3
## Lotus Europa 30.4 4 95.1
# & and && (same as | and ||)
compA <- c(2, 3, 4, 1, 2, 3)
compB <- c(1, 2, 3, 4, 5, 6)
(compA > compB) & (compA + compB < 6)
## [1] TRUE TRUE FALSE FALSE FALSE FALSE
(compA > compB) | (compA + compB < 6)
## [1] TRUE TRUE TRUE TRUE FALSE FALSE
(compA > compB) && (compA + compB < 6)
## [1] TRUE
(compA > compB) || (compA + compB < 6)
## [1] TRUE
# Loops and cat()
# for (a in b) {
# do stuff
# if (exitCond) { break }
# if (nextCond) { next }
# do some more stuff
# }
for (myVal in compA*compB) {
print(paste0("myVal is: ", myVal))
if ((myVal %% 3) == 0) { cat("Divisible by 3, not happy about that\n\n"); next }
print("That is not divisible by 3")
if ((myVal %% 5) == 0) { cat("Exiting due to divisible by 5 but not divisible by 3\n\n"); break }
cat("Onwards and upwards\n\n")
}
## [1] "myVal is: 2"
## [1] "That is not divisible by 3"
## Onwards and upwards
##
## [1] "myVal is: 6"
## Divisible by 3, not happy about that
##
## [1] "myVal is: 12"
## Divisible by 3, not happy about that
##
## [1] "myVal is: 4"
## [1] "That is not divisible by 3"
## Onwards and upwards
##
## [1] "myVal is: 10"
## [1] "That is not divisible by 3"
## Exiting due to divisible by 5 but not divisible by 3
# args() and search()
args(plot.default)
## function (x, y = NULL, type = "p", xlim = NULL, ylim = NULL,
## log = "", main = NULL, sub = NULL, xlab = NULL, ylab = NULL,
## ann = par("ann"), axes = TRUE, frame.plot = axes, panel.first = NULL,
## panel.last = NULL, asp = NA, ...)
## NULL
search()
## [1] ".GlobalEnv" "package:dplyr" "package:ggthemes"
## [4] "package:ggplot2" "package:stats" "package:graphics"
## [7] "package:grDevices" "package:utils" "package:datasets"
## [10] "package:methods" "Autoloads" "package:base"
# unique()
compA
## [1] 2 3 4 1 2 3
unique(compA)
## [1] 2 3 4 1
# unlist()
listA <- as.list(compA)
unlist(listA)
## [1] 2 3 4 1 2 3
identical(compA, unlist(listA))
## [1] TRUE
# sort()
sort(mtcars$mpg)
## [1] 10.4 10.4 13.3 14.3 14.7 15.0 15.2 15.2 15.5 15.8 16.4 17.3 17.8 18.1
## [15] 18.7 19.2 19.2 19.7 21.0 21.0 21.4 21.4 21.5 22.8 22.8 24.4 26.0 27.3
## [29] 30.4 30.4 32.4 33.9
sort(mtcars$mpg, decreasing=TRUE)
## [1] 33.9 32.4 30.4 30.4 27.3 26.0 24.4 22.8 22.8 21.5 21.4 21.4 21.0 21.0
## [15] 19.7 19.2 19.2 18.7 18.1 17.8 17.3 16.4 15.8 15.5 15.2 15.2 15.0 14.7
## [29] 14.3 13.3 10.4 10.4
# rep()
rep(1:6, times=2) # 1:6 followed by 1:6
## [1] 1 2 3 4 5 6 1 2 3 4 5 6
rep(1:6, each=2) # 1 1 2 2 3 3 4 4 5 5 6 6
## [1] 1 1 2 2 3 3 4 4 5 5 6 6
rep(1:6, times=2, each=3) # 1 1 1 2 2 2 3 3 3 4 4 4 5 5 5 6 6 6 repeated twice (each comes first)
## [1] 1 1 1 2 2 2 3 3 3 4 4 4 5 5 5 6 6 6 1 1 1 2 2 2 3 3 3 4 4 4 5 5 5 6 6
## [36] 6
rep(1:6, times=6:1) # 1 1 1 1 1 1 2 2 2 2 2 3 3 3 3 4 4 4 5 5 6
## [1] 1 1 1 1 1 1 2 2 2 2 2 3 3 3 3 4 4 4 5 5 6
# append()
myWords <- c("The", "cat", "in", "the", "hat")
paste(append(myWords, c("is", "fun", "to", "read")), collapse=" ")
## [1] "The cat in the hat is fun to read"
paste(append(myWords, "funny", 4), collapse=" ")
## [1] "The cat in the funny hat"
# grep("//1")
sampMsg <- "This is from myname@subdomain.mydomain.com again"
gsub("(^.*\\w*[a-zA-Z0-9]+@)([a-zA-Z0-9]+\\.[a-zA-Z0-9.]+)(.*$)", "\\1", sampMsg)
## [1] "This is from myname@"
gsub("(^.*\\w*[a-zA-Z0-9]+@)([a-zA-Z0-9]+\\.[a-zA-Z0-9.]+)(.*$)", "\\2", sampMsg)
## [1] "subdomain.mydomain.com"
gsub("(^.*\\w*[a-zA-Z0-9]+@)([a-zA-Z0-9]+\\.[a-zA-Z0-9.]+)(.*$)", "\\3", sampMsg)
## [1] " again"
# rev()
compA
## [1] 2 3 4 1 2 3
rev(compA)
## [1] 3 2 1 4 3 2
Below is some sample code showing examples for the apply statements:
# lapply
args(lapply)
## function (X, FUN, ...)
## NULL
lapply(1:5, FUN=sqrt)
## [[1]]
## [1] 1
##
## [[2]]
## [1] 1.414214
##
## [[3]]
## [1] 1.732051
##
## [[4]]
## [1] 2
##
## [[5]]
## [1] 2.236068
lapply(1:5, FUN=function(x, y=2) { c(x=x, y=y, pow=x^y) }, y=3)
## [[1]]
## x y pow
## 1 3 1
##
## [[2]]
## x y pow
## 2 3 8
##
## [[3]]
## x y pow
## 3 3 27
##
## [[4]]
## x y pow
## 4 3 64
##
## [[5]]
## x y pow
## 5 3 125
lapply(1:5, FUN=function(x, y=2) { if (x <= 3) {c(x=x, y=y, pow=x^y) } else { c(pow=x^y) } }, y=3)
## [[1]]
## x y pow
## 1 3 1
##
## [[2]]
## x y pow
## 2 3 8
##
## [[3]]
## x y pow
## 3 3 27
##
## [[4]]
## pow
## 64
##
## [[5]]
## pow
## 125
# sapply (defaults to returning a named vector/array if possible; is lapply otherwise)
args(sapply)
## function (X, FUN, ..., simplify = TRUE, USE.NAMES = TRUE)
## NULL
args(simplify2array)
## function (x, higher = TRUE)
## NULL
sapply(1:5, FUN=sqrt)
## [1] 1.000000 1.414214 1.732051 2.000000 2.236068
sapply(1:5, FUN=function(x, y=2) { c(x=x, y=y, pow=x^y) }, y=3)
## [,1] [,2] [,3] [,4] [,5]
## x 1 2 3 4 5
## y 3 3 3 3 3
## pow 1 8 27 64 125
sapply(1:5, FUN=function(x, y=2) { if (x <= 3) {c(x=x, y=y, pow=x^y) } else { c(pow=x^y) } }, y=3)
## [[1]]
## x y pow
## 1 3 1
##
## [[2]]
## x y pow
## 2 3 8
##
## [[3]]
## x y pow
## 3 3 27
##
## [[4]]
## pow
## 64
##
## [[5]]
## pow
## 125
# vapply (tells sapply exactly what should be returned; errors out otherwise)
args(vapply)
## function (X, FUN, FUN.VALUE, ..., USE.NAMES = TRUE)
## NULL
vapply(1:5, FUN=sqrt, FUN.VALUE=numeric(1))
## [1] 1.000000 1.414214 1.732051 2.000000 2.236068
vapply(1:5, FUN=function(x, y=2) { c(x=x, y=y, pow=x^y) }, FUN.VALUE=numeric(3), y=3)
## [,1] [,2] [,3] [,4] [,5]
## x 1 2 3 4 5
## y 3 3 3 3 3
## pow 1 8 27 64 125
Below is some sample code for handing dates and times in R:
Sys.Date()
## [1] "2017-12-24"
Sys.time()
## [1] "2017-12-24 06:29:03 CST"
args(strptime)
## function (x, format, tz = "")
## NULL
rightNow <- as.POSIXct(Sys.time())
format(rightNow, "%Y**%M-%d %H hours and %M minutes", usetz=TRUE)
## [1] "2017**29-24 06 hours and 29 minutes CST"
lastChristmasNoon <- as.POSIXct("2015-12-25 12:00:00", format="%Y-%m-%d %X")
rightNow - lastChristmasNoon
## Time difference of 729.7702 days
nextUMHomeGame <- as.POSIXct("16/SEP/3 12:00:00", format="%y/%b/%d %H:%M:%S", tz="America/Detroit")
nextUMHomeGame - rightNow
## Time difference of -476.8535 days
# Time zones available in R
OlsonNames()
## [1] "Africa/Abidjan" "Africa/Accra"
## [3] "Africa/Addis_Ababa" "Africa/Algiers"
## [5] "Africa/Asmara" "Africa/Asmera"
## [7] "Africa/Bamako" "Africa/Bangui"
## [9] "Africa/Banjul" "Africa/Bissau"
## [11] "Africa/Blantyre" "Africa/Brazzaville"
## [13] "Africa/Bujumbura" "Africa/Cairo"
## [15] "Africa/Casablanca" "Africa/Ceuta"
## [17] "Africa/Conakry" "Africa/Dakar"
## [19] "Africa/Dar_es_Salaam" "Africa/Djibouti"
## [21] "Africa/Douala" "Africa/El_Aaiun"
## [23] "Africa/Freetown" "Africa/Gaborone"
## [25] "Africa/Harare" "Africa/Johannesburg"
## [27] "Africa/Juba" "Africa/Kampala"
## [29] "Africa/Khartoum" "Africa/Kigali"
## [31] "Africa/Kinshasa" "Africa/Lagos"
## [33] "Africa/Libreville" "Africa/Lome"
## [35] "Africa/Luanda" "Africa/Lubumbashi"
## [37] "Africa/Lusaka" "Africa/Malabo"
## [39] "Africa/Maputo" "Africa/Maseru"
## [41] "Africa/Mbabane" "Africa/Mogadishu"
## [43] "Africa/Monrovia" "Africa/Nairobi"
## [45] "Africa/Ndjamena" "Africa/Niamey"
## [47] "Africa/Nouakchott" "Africa/Ouagadougou"
## [49] "Africa/Porto-Novo" "Africa/Sao_Tome"
## [51] "Africa/Timbuktu" "Africa/Tripoli"
## [53] "Africa/Tunis" "Africa/Windhoek"
## [55] "America/Adak" "America/Anchorage"
## [57] "America/Anguilla" "America/Antigua"
## [59] "America/Araguaina" "America/Argentina/Buenos_Aires"
## [61] "America/Argentina/Catamarca" "America/Argentina/ComodRivadavia"
## [63] "America/Argentina/Cordoba" "America/Argentina/Jujuy"
## [65] "America/Argentina/La_Rioja" "America/Argentina/Mendoza"
## [67] "America/Argentina/Rio_Gallegos" "America/Argentina/Salta"
## [69] "America/Argentina/San_Juan" "America/Argentina/San_Luis"
## [71] "America/Argentina/Tucuman" "America/Argentina/Ushuaia"
## [73] "America/Aruba" "America/Asuncion"
## [75] "America/Atikokan" "America/Atka"
## [77] "America/Bahia" "America/Bahia_Banderas"
## [79] "America/Barbados" "America/Belem"
## [81] "America/Belize" "America/Blanc-Sablon"
## [83] "America/Boa_Vista" "America/Bogota"
## [85] "America/Boise" "America/Buenos_Aires"
## [87] "America/Cambridge_Bay" "America/Campo_Grande"
## [89] "America/Cancun" "America/Caracas"
## [91] "America/Catamarca" "America/Cayenne"
## [93] "America/Cayman" "America/Chicago"
## [95] "America/Chihuahua" "America/Coral_Harbour"
## [97] "America/Cordoba" "America/Costa_Rica"
## [99] "America/Creston" "America/Cuiaba"
## [101] "America/Curacao" "America/Danmarkshavn"
## [103] "America/Dawson" "America/Dawson_Creek"
## [105] "America/Denver" "America/Detroit"
## [107] "America/Dominica" "America/Edmonton"
## [109] "America/Eirunepe" "America/El_Salvador"
## [111] "America/Ensenada" "America/Fort_Nelson"
## [113] "America/Fort_Wayne" "America/Fortaleza"
## [115] "America/Glace_Bay" "America/Godthab"
## [117] "America/Goose_Bay" "America/Grand_Turk"
## [119] "America/Grenada" "America/Guadeloupe"
## [121] "America/Guatemala" "America/Guayaquil"
## [123] "America/Guyana" "America/Halifax"
## [125] "America/Havana" "America/Hermosillo"
## [127] "America/Indiana/Indianapolis" "America/Indiana/Knox"
## [129] "America/Indiana/Marengo" "America/Indiana/Petersburg"
## [131] "America/Indiana/Tell_City" "America/Indiana/Vevay"
## [133] "America/Indiana/Vincennes" "America/Indiana/Winamac"
## [135] "America/Indianapolis" "America/Inuvik"
## [137] "America/Iqaluit" "America/Jamaica"
## [139] "America/Jujuy" "America/Juneau"
## [141] "America/Kentucky/Louisville" "America/Kentucky/Monticello"
## [143] "America/Knox_IN" "America/Kralendijk"
## [145] "America/La_Paz" "America/Lima"
## [147] "America/Los_Angeles" "America/Louisville"
## [149] "America/Lower_Princes" "America/Maceio"
## [151] "America/Managua" "America/Manaus"
## [153] "America/Marigot" "America/Martinique"
## [155] "America/Matamoros" "America/Mazatlan"
## [157] "America/Mendoza" "America/Menominee"
## [159] "America/Merida" "America/Metlakatla"
## [161] "America/Mexico_City" "America/Miquelon"
## [163] "America/Moncton" "America/Monterrey"
## [165] "America/Montevideo" "America/Montreal"
## [167] "America/Montserrat" "America/Nassau"
## [169] "America/New_York" "America/Nipigon"
## [171] "America/Nome" "America/Noronha"
## [173] "America/North_Dakota/Beulah" "America/North_Dakota/Center"
## [175] "America/North_Dakota/New_Salem" "America/Ojinaga"
## [177] "America/Panama" "America/Pangnirtung"
## [179] "America/Paramaribo" "America/Phoenix"
## [181] "America/Port-au-Prince" "America/Port_of_Spain"
## [183] "America/Porto_Acre" "America/Porto_Velho"
## [185] "America/Puerto_Rico" "America/Rainy_River"
## [187] "America/Rankin_Inlet" "America/Recife"
## [189] "America/Regina" "America/Resolute"
## [191] "America/Rio_Branco" "America/Rosario"
## [193] "America/Santa_Isabel" "America/Santarem"
## [195] "America/Santiago" "America/Santo_Domingo"
## [197] "America/Sao_Paulo" "America/Scoresbysund"
## [199] "America/Shiprock" "America/Sitka"
## [201] "America/St_Barthelemy" "America/St_Johns"
## [203] "America/St_Kitts" "America/St_Lucia"
## [205] "America/St_Thomas" "America/St_Vincent"
## [207] "America/Swift_Current" "America/Tegucigalpa"
## [209] "America/Thule" "America/Thunder_Bay"
## [211] "America/Tijuana" "America/Toronto"
## [213] "America/Tortola" "America/Vancouver"
## [215] "America/Virgin" "America/Whitehorse"
## [217] "America/Winnipeg" "America/Yakutat"
## [219] "America/Yellowknife" "Antarctica/Casey"
## [221] "Antarctica/Davis" "Antarctica/DumontDUrville"
## [223] "Antarctica/Macquarie" "Antarctica/Mawson"
## [225] "Antarctica/McMurdo" "Antarctica/Palmer"
## [227] "Antarctica/Rothera" "Antarctica/South_Pole"
## [229] "Antarctica/Syowa" "Antarctica/Troll"
## [231] "Antarctica/Vostok" "Arctic/Longyearbyen"
## [233] "Asia/Aden" "Asia/Almaty"
## [235] "Asia/Amman" "Asia/Anadyr"
## [237] "Asia/Aqtau" "Asia/Aqtobe"
## [239] "Asia/Ashgabat" "Asia/Ashkhabad"
## [241] "Asia/Atyrau" "Asia/Baghdad"
## [243] "Asia/Bahrain" "Asia/Baku"
## [245] "Asia/Bangkok" "Asia/Barnaul"
## [247] "Asia/Beirut" "Asia/Bishkek"
## [249] "Asia/Brunei" "Asia/Calcutta"
## [251] "Asia/Chita" "Asia/Choibalsan"
## [253] "Asia/Chongqing" "Asia/Chungking"
## [255] "Asia/Colombo" "Asia/Dacca"
## [257] "Asia/Damascus" "Asia/Dhaka"
## [259] "Asia/Dili" "Asia/Dubai"
## [261] "Asia/Dushanbe" "Asia/Famagusta"
## [263] "Asia/Gaza" "Asia/Harbin"
## [265] "Asia/Hebron" "Asia/Ho_Chi_Minh"
## [267] "Asia/Hong_Kong" "Asia/Hovd"
## [269] "Asia/Irkutsk" "Asia/Istanbul"
## [271] "Asia/Jakarta" "Asia/Jayapura"
## [273] "Asia/Jerusalem" "Asia/Kabul"
## [275] "Asia/Kamchatka" "Asia/Karachi"
## [277] "Asia/Kashgar" "Asia/Kathmandu"
## [279] "Asia/Katmandu" "Asia/Khandyga"
## [281] "Asia/Kolkata" "Asia/Krasnoyarsk"
## [283] "Asia/Kuala_Lumpur" "Asia/Kuching"
## [285] "Asia/Kuwait" "Asia/Macao"
## [287] "Asia/Macau" "Asia/Magadan"
## [289] "Asia/Makassar" "Asia/Manila"
## [291] "Asia/Muscat" "Asia/Nicosia"
## [293] "Asia/Novokuznetsk" "Asia/Novosibirsk"
## [295] "Asia/Omsk" "Asia/Oral"
## [297] "Asia/Phnom_Penh" "Asia/Pontianak"
## [299] "Asia/Pyongyang" "Asia/Qatar"
## [301] "Asia/Qyzylorda" "Asia/Rangoon"
## [303] "Asia/Riyadh" "Asia/Saigon"
## [305] "Asia/Sakhalin" "Asia/Samarkand"
## [307] "Asia/Seoul" "Asia/Shanghai"
## [309] "Asia/Singapore" "Asia/Srednekolymsk"
## [311] "Asia/Taipei" "Asia/Tashkent"
## [313] "Asia/Tbilisi" "Asia/Tehran"
## [315] "Asia/Tel_Aviv" "Asia/Thimbu"
## [317] "Asia/Thimphu" "Asia/Tokyo"
## [319] "Asia/Tomsk" "Asia/Ujung_Pandang"
## [321] "Asia/Ulaanbaatar" "Asia/Ulan_Bator"
## [323] "Asia/Urumqi" "Asia/Ust-Nera"
## [325] "Asia/Vientiane" "Asia/Vladivostok"
## [327] "Asia/Yakutsk" "Asia/Yangon"
## [329] "Asia/Yekaterinburg" "Asia/Yerevan"
## [331] "Atlantic/Azores" "Atlantic/Bermuda"
## [333] "Atlantic/Canary" "Atlantic/Cape_Verde"
## [335] "Atlantic/Faeroe" "Atlantic/Faroe"
## [337] "Atlantic/Jan_Mayen" "Atlantic/Madeira"
## [339] "Atlantic/Reykjavik" "Atlantic/South_Georgia"
## [341] "Atlantic/St_Helena" "Atlantic/Stanley"
## [343] "Australia/ACT" "Australia/Adelaide"
## [345] "Australia/Brisbane" "Australia/Broken_Hill"
## [347] "Australia/Canberra" "Australia/Currie"
## [349] "Australia/Darwin" "Australia/Eucla"
## [351] "Australia/Hobart" "Australia/LHI"
## [353] "Australia/Lindeman" "Australia/Lord_Howe"
## [355] "Australia/Melbourne" "Australia/North"
## [357] "Australia/NSW" "Australia/Perth"
## [359] "Australia/Queensland" "Australia/South"
## [361] "Australia/Sydney" "Australia/Tasmania"
## [363] "Australia/Victoria" "Australia/West"
## [365] "Australia/Yancowinna" "Brazil/Acre"
## [367] "Brazil/DeNoronha" "Brazil/East"
## [369] "Brazil/West" "Canada/Atlantic"
## [371] "Canada/Central" "Canada/East-Saskatchewan"
## [373] "Canada/Eastern" "Canada/Mountain"
## [375] "Canada/Newfoundland" "Canada/Pacific"
## [377] "Canada/Saskatchewan" "Canada/Yukon"
## [379] "CET" "Chile/Continental"
## [381] "Chile/EasterIsland" "CST6CDT"
## [383] "Cuba" "EET"
## [385] "Egypt" "Eire"
## [387] "EST" "EST5EDT"
## [389] "Etc/GMT" "Etc/GMT-0"
## [391] "Etc/GMT-1" "Etc/GMT-10"
## [393] "Etc/GMT-11" "Etc/GMT-12"
## [395] "Etc/GMT-13" "Etc/GMT-14"
## [397] "Etc/GMT-2" "Etc/GMT-3"
## [399] "Etc/GMT-4" "Etc/GMT-5"
## [401] "Etc/GMT-6" "Etc/GMT-7"
## [403] "Etc/GMT-8" "Etc/GMT-9"
## [405] "Etc/GMT+0" "Etc/GMT+1"
## [407] "Etc/GMT+10" "Etc/GMT+11"
## [409] "Etc/GMT+12" "Etc/GMT+2"
## [411] "Etc/GMT+3" "Etc/GMT+4"
## [413] "Etc/GMT+5" "Etc/GMT+6"
## [415] "Etc/GMT+7" "Etc/GMT+8"
## [417] "Etc/GMT+9" "Etc/GMT0"
## [419] "Etc/Greenwich" "Etc/UCT"
## [421] "Etc/Universal" "Etc/UTC"
## [423] "Etc/Zulu" "Europe/Amsterdam"
## [425] "Europe/Andorra" "Europe/Astrakhan"
## [427] "Europe/Athens" "Europe/Belfast"
## [429] "Europe/Belgrade" "Europe/Berlin"
## [431] "Europe/Bratislava" "Europe/Brussels"
## [433] "Europe/Bucharest" "Europe/Budapest"
## [435] "Europe/Busingen" "Europe/Chisinau"
## [437] "Europe/Copenhagen" "Europe/Dublin"
## [439] "Europe/Gibraltar" "Europe/Guernsey"
## [441] "Europe/Helsinki" "Europe/Isle_of_Man"
## [443] "Europe/Istanbul" "Europe/Jersey"
## [445] "Europe/Kaliningrad" "Europe/Kiev"
## [447] "Europe/Kirov" "Europe/Lisbon"
## [449] "Europe/Ljubljana" "Europe/London"
## [451] "Europe/Luxembourg" "Europe/Madrid"
## [453] "Europe/Malta" "Europe/Mariehamn"
## [455] "Europe/Minsk" "Europe/Monaco"
## [457] "Europe/Moscow" "Europe/Nicosia"
## [459] "Europe/Oslo" "Europe/Paris"
## [461] "Europe/Podgorica" "Europe/Prague"
## [463] "Europe/Riga" "Europe/Rome"
## [465] "Europe/Samara" "Europe/San_Marino"
## [467] "Europe/Sarajevo" "Europe/Saratov"
## [469] "Europe/Simferopol" "Europe/Skopje"
## [471] "Europe/Sofia" "Europe/Stockholm"
## [473] "Europe/Tallinn" "Europe/Tirane"
## [475] "Europe/Tiraspol" "Europe/Ulyanovsk"
## [477] "Europe/Uzhgorod" "Europe/Vaduz"
## [479] "Europe/Vatican" "Europe/Vienna"
## [481] "Europe/Vilnius" "Europe/Volgograd"
## [483] "Europe/Warsaw" "Europe/Zagreb"
## [485] "Europe/Zaporozhye" "Europe/Zurich"
## [487] "GB" "GB-Eire"
## [489] "GMT" "GMT-0"
## [491] "GMT+0" "GMT0"
## [493] "Greenwich" "Hongkong"
## [495] "HST" "Iceland"
## [497] "Indian/Antananarivo" "Indian/Chagos"
## [499] "Indian/Christmas" "Indian/Cocos"
## [501] "Indian/Comoro" "Indian/Kerguelen"
## [503] "Indian/Mahe" "Indian/Maldives"
## [505] "Indian/Mauritius" "Indian/Mayotte"
## [507] "Indian/Reunion" "Iran"
## [509] "Israel" "Jamaica"
## [511] "Japan" "Kwajalein"
## [513] "Libya" "MET"
## [515] "Mexico/BajaNorte" "Mexico/BajaSur"
## [517] "Mexico/General" "MST"
## [519] "MST7MDT" "Navajo"
## [521] "NZ" "NZ-CHAT"
## [523] "Pacific/Apia" "Pacific/Auckland"
## [525] "Pacific/Bougainville" "Pacific/Chatham"
## [527] "Pacific/Chuuk" "Pacific/Easter"
## [529] "Pacific/Efate" "Pacific/Enderbury"
## [531] "Pacific/Fakaofo" "Pacific/Fiji"
## [533] "Pacific/Funafuti" "Pacific/Galapagos"
## [535] "Pacific/Gambier" "Pacific/Guadalcanal"
## [537] "Pacific/Guam" "Pacific/Honolulu"
## [539] "Pacific/Johnston" "Pacific/Kiritimati"
## [541] "Pacific/Kosrae" "Pacific/Kwajalein"
## [543] "Pacific/Majuro" "Pacific/Marquesas"
## [545] "Pacific/Midway" "Pacific/Nauru"
## [547] "Pacific/Niue" "Pacific/Norfolk"
## [549] "Pacific/Noumea" "Pacific/Pago_Pago"
## [551] "Pacific/Palau" "Pacific/Pitcairn"
## [553] "Pacific/Pohnpei" "Pacific/Ponape"
## [555] "Pacific/Port_Moresby" "Pacific/Rarotonga"
## [557] "Pacific/Saipan" "Pacific/Samoa"
## [559] "Pacific/Tahiti" "Pacific/Tarawa"
## [561] "Pacific/Tongatapu" "Pacific/Truk"
## [563] "Pacific/Wake" "Pacific/Wallis"
## [565] "Pacific/Yap" "Poland"
## [567] "Portugal" "PRC"
## [569] "PST8PDT" "ROC"
## [571] "ROK" "Singapore"
## [573] "Turkey" "UCT"
## [575] "Universal" "US/Alaska"
## [577] "US/Aleutian" "US/Arizona"
## [579] "US/Central" "US/East-Indiana"
## [581] "US/Eastern" "US/Hawaii"
## [583] "US/Indiana-Starke" "US/Michigan"
## [585] "US/Mountain" "US/Pacific"
## [587] "US/Pacific-New" "US/Samoa"
## [589] "UTC" "VERSION"
## [591] "W-SU" "WET"
## [593] "Zulu"
# From ?strptime (excerpted)
#
# ** General formats **
# %c Date and time. Locale-specific on output, "%a %b %e %H:%M:%S %Y" on input.
# %F Equivalent to %Y-%m-%d (the ISO 8601 date format).
# %T Equivalent to %H:%M:%S.
# %D Date format such as %m/%d/%y: the C99 standard says it should be that exact format
# %x Date. Locale-specific on output, "%y/%m/%d" on input.
# %X Time. Locale-specific on output, "%H:%M:%S" on input.
#
# ** Key Components **
# %y Year without century (00-99). On input, values 00 to 68 are prefixed by 20 and 69 to 99 by 19
# %Y Year with century
# %m Month as decimal number (01-12).
# %b Abbreviated month name in the current locale on this platform.
# %B Full month name in the current locale.
# %d Day of the month as decimal number (01-31).
# %e Day of the month as decimal number (1-31), with a leading space for a single-digit number.
# %a Abbreviated weekday name in the current locale on this platform.
# %A Full weekday name in the current locale.
# %H Hours as decimal number (00-23)
# %I Hours as decimal number (01-12)
# %M Minute as decimal number (00-59).
# %S Second as integer (00-61), allowing for up to two leap-seconds (but POSIX-compliant implementations will ignore leap seconds).
#
# ** Additional Options **
# %C Century (00-99): the integer part of the year divided by 100.
#
# %g The last two digits of the week-based year (see %V). (Accepted but ignored on input.)
# %G The week-based year (see %V) as a decimal number. (Accepted but ignored on input.)
#
# %h Equivalent to %b.
#
# %j Day of year as decimal number (001-366).
#
# %n Newline on output, arbitrary whitespace on input.
#
# %p AM/PM indicator in the locale. Used in conjunction with %I and not with %H. An empty string in some locales (and the behaviour is undefined if used for input in such a locale). Some platforms accept %P for output, which uses a lower-case version: others will output P.
#
# %r The 12-hour clock time (using the locale's AM or PM). Only defined in some locales.
#
# %R Equivalent to %H:%M.
#
# %t Tab on output, arbitrary whitespace on input.
#
# %u Weekday as a decimal number (1-7, Monday is 1).
#
# %U Week of the year as decimal number (00-53) using Sunday as the first day 1 of the week (and typically with the first Sunday of the year as day 1 of week 1). The US convention.
#
# %V Week of the year as decimal number (01-53) as defined in ISO 8601. If the week (starting on Monday) containing 1 January has four or more days in the new year, then it is considered week 1. Otherwise, it is the last week of the previous year, and the next week is week 1. (Accepted but ignored on input.)
#
# %w Weekday as decimal number (0-6, Sunday is 0).
#
# %W Week of the year as decimal number (00-53) using Monday as the first day of week (and typically with the first Monday of the year as day 1 of week 1). The UK convention.
#
# For input, only years 0:9999 are accepted.
#
# %z Signed offset in hours and minutes from UTC, so -0800 is 8 hours behind UTC. Values up to +1400 are accepted as from R 3.1.1: previous versions only accepted up to +1200. (Standard only for output.)
#
# %Z (Output only.) Time zone abbreviation as a character string (empty if not available). This may not be reliable when a time zone has changed abbreviations over the years.
Additionally, code from several practice examples is added:
set.seed(1608221310)
me <- 89
other_199 <- round(rnorm(199, mean=75.45, sd=11.03), 0)
mean(other_199)
## [1] 75.17588
sd(other_199)
## [1] 11.37711
desMeans <- c(72.275, 76.24, 74.5, 77.695)
desSD <- c(12.31, 11.22, 12.5, 12.53)
prevData <- c(rnorm(200, mean=72.275, sd=12.31),
rnorm(200, mean=76.24, sd=11.22),
rnorm(200, mean=74.5, sd=12.5),
rnorm(200, mean=77.695, sd=12.53)
)
previous_4 <- matrix(data=prevData, ncol=4)
curMeans <- apply(previous_4, 2, FUN=mean)
curSD <- apply(previous_4, 2, FUN=sd)
previous_4 <- t(apply(previous_4, 1, FUN=function(x) { desMeans + (desSD / curSD) * (x - curMeans) } ))
apply(round(previous_4, 0), 2, FUN=mean)
## [1] 72.285 76.245 74.505 77.665
apply(round(previous_4, 0), 2, FUN=sd)
## [1] 12.35097 11.19202 12.49643 12.51744
previous_4 <- round(previous_4, 0)
# Merge me and other_199: my_class
my_class <- c(me, other_199)
# cbind() my_class and previous_4: last_5
last_5 <- cbind(my_class, previous_4)
# Name last_5 appropriately
nms <- paste0("year_", 1:5)
colnames(last_5) <- nms
# Build histogram of my_class
hist(my_class)
# Generate summary of last_5
summary(last_5)
## year_1 year_2 year_3 year_4
## Min. : 46.00 Min. : 43.00 Min. : 38.00 Min. : 42.00
## 1st Qu.: 68.00 1st Qu.: 63.75 1st Qu.: 69.00 1st Qu.: 65.75
## Median : 75.50 Median : 73.00 Median : 76.50 Median : 74.00
## Mean : 75.25 Mean : 72.28 Mean : 76.25 Mean : 74.50
## 3rd Qu.: 83.25 3rd Qu.: 81.00 3rd Qu.: 84.25 3rd Qu.: 82.25
## Max. :108.00 Max. :108.00 Max. :102.00 Max. :113.00
## year_5
## Min. : 38.00
## 1st Qu.: 71.00
## Median : 78.00
## Mean : 77.67
## 3rd Qu.: 86.00
## Max. :117.00
# Build boxplot of last_5
boxplot(last_5)
# How many grades in your class are higher than 75?
sum(my_class > 75)
## [1] 100
# How many students in your class scored strictly higher than you?
sum(my_class > me)
## [1] 17
# What's the proportion of grades below or equal to 64 in the last 5 years?
mean(last_5 <= 64)
## [1] 0.191
# Is your grade greater than 87 and smaller than or equal to 89?
me > 87 & me <= 89
## [1] TRUE
# Which grades in your class are below 60 or above 90?
my_class < 60 | my_class > 90
## [1] FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE
## [12] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE
## [23] TRUE FALSE TRUE FALSE FALSE TRUE TRUE FALSE FALSE FALSE FALSE
## [34] TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE
## [45] TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE
## [56] FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [67] FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE TRUE FALSE
## [78] FALSE TRUE FALSE FALSE FALSE TRUE TRUE FALSE FALSE FALSE FALSE
## [89] TRUE FALSE FALSE FALSE FALSE TRUE FALSE FALSE TRUE TRUE FALSE
## [100] FALSE FALSE TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [111] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [122] FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE TRUE FALSE FALSE
## [133] FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [144] TRUE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE TRUE
## [155] FALSE TRUE FALSE TRUE FALSE FALSE FALSE TRUE FALSE FALSE FALSE
## [166] FALSE FALSE TRUE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE
## [177] FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [188] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE
## [199] FALSE FALSE
# What's the proportion of grades in your class that is average?
mean(my_class >= 70 & my_class <= 85)
## [1] 0.525
# How many students in the last 5 years had a grade of 80 or 90?
sum(last_5 %in% c(80, 90))
## [1] 44
# Define n_smart
n_smart <- sum(my_class >= 80)
# Code the if-else construct
if (n_smart > 50) {
print("smart class")
} else {
print("rather average")
}
## [1] "smart class"
# Define prop_less
prop_less <- mean(my_class < me)
# Code the control construct
if (prop_less > 0.9) {
print("you're among the best 10 percent")
} else if (prop_less > 0.8) {
print("you're among the best 20 percent")
} else {
print("need more analysis")
}
## [1] "you're among the best 20 percent"
# Embedded control structure: fix the error
if (mean(my_class) < 75) {
if (mean(my_class) > me) {
print("average year, but still smarter than me")
} else {
print("average year, but I'm not that bad")
}
} else {
if (mean(my_class) > me) {
print("smart year, even smarter than me")
} else {
print("smart year, but I am smarter")
}
}
## [1] "smart year, but I am smarter"
# Create top_grades
top_grades <- my_class[my_class >= 85]
# Create worst_grades
worst_grades <- my_class[my_class < 65]
# Write conditional statement
if (length(top_grades) > length(worst_grades)) { print("top grades prevail") }
## [1] "top grades prevail"
Hadley and Charlotte Wickham led a course on writing functions in R. Broadly, the course includes advice on when/how to use functions, as well as specific advice about commands available through library(purrr).
Key pieces of advice include:
John Chambers gave a few useful slogans about functions:
Each function has three components:
Only the LAST evaluated expression is returned. The use of return() is recommended only for early-returns in a special case (for example, when a break() will be called).
Further, functions can be written anonymously on the command line, such as (function (x) {x + 1}) (1:5). A function should only depend on arguments passed to it, not variables from a parent enviornment. Every time the function is called, it receives a clean working environment. Once it finishes, its variables are no longer available unless they were returned (either by default as the last operation, or by way of return()):
# Components of a function
args(rnorm)
## function (n, mean = 0, sd = 1)
## NULL
formals(rnorm)
## $n
##
##
## $mean
## [1] 0
##
## $sd
## [1] 1
body(rnorm)
## .Call(C_rnorm, n, mean, sd)
environment(rnorm)
## <environment: namespace:stats>
# What is passed back
funDummy <- function(x) {
if (x <= 2) {
print("That is too small")
return(3) # This ends the function by convention
}
ceiling(x) # This is the defaulted return() value if nothing happened to prevent the code getting here
}
funDummy(1)
## [1] "That is too small"
## [1] 3
funDummy(5)
## [1] 5
# Anonymous functions
(function (x) {x + 1}) (1:5)
## [1] 2 3 4 5 6
The course includes some insightful discussion of vectors. As it happens, lists and data frames are just special collections of vectors in R. Each column of a data frame is a vector, while each element of a list is either 1) an embedded data frame (which is eventually a vector by way of columns), 2) an embedded list (which is eventually a vector by way of recursion), or 3) an actual vector.
The atomic vectors are of types logical, integer, character, and double; complex and raw are rarer types that are also available. Lists are just recursive vectors, which is to say that lists can contain other lists and can be hetergeneous. To explore vectors, you have:
Note that NULL is the absence of a vector and has length 0. NA is the absence of an element in the vector and has length 1. All math operations with NA return NA; for example NA == NA will return NA.
There are some good tips on extracting element from a list:
# Data types
data(mtcars)
str(mtcars)
## 'data.frame': 32 obs. of 11 variables:
## $ mpg : num 21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
## $ cyl : num 6 6 4 6 8 6 8 4 4 6 ...
## $ disp: num 160 160 108 258 360 ...
## $ hp : num 110 110 93 110 175 105 245 62 95 123 ...
## $ drat: num 3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
## $ wt : num 2.62 2.88 2.32 3.21 3.44 ...
## $ qsec: num 16.5 17 18.6 19.4 17 ...
## $ vs : num 0 0 1 1 0 1 0 1 1 1 ...
## $ am : num 1 1 1 0 0 0 0 0 0 0 ...
## $ gear: num 4 4 4 3 3 3 3 4 4 4 ...
## $ carb: num 4 4 1 1 2 1 4 2 2 4 ...
typeof(mtcars) # n.b. that this is technically a "list"
## [1] "list"
length(mtcars)
## [1] 11
# NULL and NA
length(NULL)
## [1] 0
typeof(NULL)
## [1] "NULL"
length(NA)
## [1] 1
typeof(NA)
## [1] "logical"
NULL == NULL
## logical(0)
NULL == NA
## logical(0)
NA == NA
## [1] NA
is.null(NULL)
## [1] TRUE
is.null(NA)
## [1] FALSE
is.na(NULL)
## Warning in is.na(NULL): is.na() applied to non-(list or vector) of type
## 'NULL'
## logical(0)
is.na(NA)
## [1] TRUE
# Extraction
mtcars[["mpg"]][1:5]
## [1] 21.0 21.0 22.8 21.4 18.7
mtcars[[2]][1:5]
## [1] 6 6 4 6 8
mtcars$hp[1:5]
## [1] 110 110 93 110 175
# Relevant lengths
seq_along(mtcars)
## [1] 1 2 3 4 5 6 7 8 9 10 11
x <- data.frame()
seq_along(x)
## integer(0)
length(seq_along(x))
## [1] 0
foo <- function(x) { for (eachCol in seq_along(x)) { print(typeof(x[[eachCol]])) }}
foo(mtcars)
## [1] "double"
## [1] "double"
## [1] "double"
## [1] "double"
## [1] "double"
## [1] "double"
## [1] "double"
## [1] "double"
## [1] "double"
## [1] "double"
## [1] "double"
foo(x) # Note that this does nothing!
data(airquality)
str(airquality)
## 'data.frame': 153 obs. of 6 variables:
## $ Ozone : int 41 36 12 18 NA 28 23 19 8 NA ...
## $ Solar.R: int 190 118 149 313 NA NA 299 99 19 194 ...
## $ Wind : num 7.4 8 12.6 11.5 14.3 14.9 8.6 13.8 20.1 8.6 ...
## $ Temp : int 67 72 74 62 56 66 65 59 61 69 ...
## $ Month : int 5 5 5 5 5 5 5 5 5 5 ...
## $ Day : int 1 2 3 4 5 6 7 8 9 10 ...
foo(airquality)
## [1] "integer"
## [1] "integer"
## [1] "double"
## [1] "integer"
## [1] "integer"
## [1] "integer"
# Range command
mpgRange <- range(mtcars$mpg)
mpgRange
## [1] 10.4 33.9
mpgScale <- (mtcars$mpg - mpgRange[1]) / (mpgRange[2] - mpgRange[1])
summary(mpgScale)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.2138 0.3745 0.4124 0.5277 1.0000
The typical arguments in a function use a consistent, simple naming function:
Data arguments should come before detail arguments, and detail arguments should be given reasonable default values. See for example rnorm(n, mean=0, sd=1). The number requested (n) must be specified, but defaults are available for the details (mean and standard deviation).
Functions can be passed as arguments to other functions, which is at the core of functional programming. For example:
do_math <- function(x, fun) { fun(x) }
do_math(1:10, fun=mean)
## [1] 5.5
do_math(1:10, fun=sd)
## [1] 3.02765
The library(purrr) takes advantage of this, and in a type-consistent manner. There are functions for:
The general arguments are .x (a list or an atomic vector) and .f which can be either a function, an anonymous function (formula with ~), or an extractor .x[[.f]]. For example:
library(purrr)
##
## Attaching package: 'purrr'
## The following objects are masked from 'package:dplyr':
##
## contains, order_by
library(RColorBrewer) # Need to have in non-cached chunk for later
data(mtcars)
# Create output as a list
map(.x=mtcars, .f=sum)
## $mpg
## [1] 642.9
##
## $cyl
## [1] 198
##
## $disp
## [1] 7383.1
##
## $hp
## [1] 4694
##
## $drat
## [1] 115.09
##
## $wt
## [1] 102.952
##
## $qsec
## [1] 571.16
##
## $vs
## [1] 14
##
## $am
## [1] 13
##
## $gear
## [1] 118
##
## $carb
## [1] 90
# Create same output as a double
map_dbl(.x=mtcars, .f=sum)
## mpg cyl disp hp drat wt qsec vs
## 642.900 198.000 7383.100 4694.000 115.090 102.952 571.160 14.000
## am gear carb
## 13.000 118.000 90.000
# Create same output as integer
# map_int(.x=mtcars, .f=sum) . . . this would bomb since it is not actually an integere
map_int(.x=mtcars, .f=function(x) { as.integer(round(sum(x), 0)) } )
## mpg cyl disp hp drat wt qsec vs am gear carb
## 643 198 7383 4694 115 103 571 14 13 118 90
# Same thing but using an anonymous function with ~ and .
map_int(.x=mtcars, .f = ~ as.integer(round(sum(.), 0)) )
## mpg cyl disp hp drat wt qsec vs am gear carb
## 643 198 7383 4694 115 103 571 14 13 118 90
# Create a boolean vector
map_lgl(.x=mtcars, .f = ~ ifelse(sum(.) > 200, TRUE, FALSE) )
## mpg cyl disp hp drat wt qsec vs am gear carb
## TRUE FALSE TRUE TRUE FALSE FALSE TRUE FALSE FALSE FALSE FALSE
# Create a character vector
map_chr(.x=mtcars, .f = ~ ifelse(sum(.) > 200, "Large", "Not So Large") )
## mpg cyl disp hp drat
## "Large" "Not So Large" "Large" "Large" "Not So Large"
## wt qsec vs am gear
## "Not So Large" "Large" "Not So Large" "Not So Large" "Not So Large"
## carb
## "Not So Large"
# Use the extractor [pulls the first row]
map_dbl(.x=mtcars, .f=1)
## mpg cyl disp hp drat wt qsec vs am gear
## 21.00 6.00 160.00 110.00 3.90 2.62 16.46 0.00 1.00 4.00
## carb
## 4.00
# Example from help file using chaining
mtcars %>%
split(.$cyl) %>%
map(~ lm(mpg ~ wt, data = .x)) %>%
map(summary) %>%
map_dbl("r.squared")
## 4 6 8
## 0.5086326 0.4645102 0.4229655
# Using sapply
sapply(split(mtcars, mtcars$cyl), FUN=function(.x) { summary(lm(mpg ~ wt, data=.x))$r.squared } )
## 4 6 8
## 0.5086326 0.4645102 0.4229655
# Use the extractor from a list
cylSplit <- split(mtcars, mtcars$cyl)
map(cylSplit, "mpg")
## $`4`
## [1] 22.8 24.4 22.8 32.4 30.4 33.9 21.5 27.3 26.0 30.4 21.4
##
## $`6`
## [1] 21.0 21.0 21.4 18.1 19.2 17.8 19.7
##
## $`8`
## [1] 18.7 14.3 16.4 17.3 15.2 10.4 10.4 14.7 15.5 15.2 13.3 19.2 15.8 15.0
map(cylSplit, "cyl")
## $`4`
## [1] 4 4 4 4 4 4 4 4 4 4 4
##
## $`6`
## [1] 6 6 6 6 6 6 6
##
## $`8`
## [1] 8 8 8 8 8 8 8 8 8 8 8 8 8 8
The purrr library has several additional interesting functions:
Some example code includes:
library(purrr) # Called again for clarity; all these key functions belong to purrr
# safely(.f, otherwise = NULL, quiet = TRUE)
safe_log10 <- safely(log10)
map(list(0, 1, 10, "a"), .f=safe_log10)
## [[1]]
## [[1]]$result
## [1] -Inf
##
## [[1]]$error
## NULL
##
##
## [[2]]
## [[2]]$result
## [1] 0
##
## [[2]]$error
## NULL
##
##
## [[3]]
## [[3]]$result
## [1] 1
##
## [[3]]$error
## NULL
##
##
## [[4]]
## [[4]]$result
## NULL
##
## [[4]]$error
## <simpleError in .f(...): non-numeric argument to mathematical function>
# possibly(.f, otherwise, quiet = TRUE)
poss_log10 <- possibly(log10, otherwise=NaN)
map_dbl(list(0, 1, 10, "a"), .f=poss_log10)
## [1] -Inf 0 1 NaN
# transpose() - note that this can become masked by data.table::transpose() so be careful
purrr::transpose(map(list(0, 1, 10, "a"), .f=safe_log10))
## $result
## $result[[1]]
## [1] -Inf
##
## $result[[2]]
## [1] 0
##
## $result[[3]]
## [1] 1
##
## $result[[4]]
## NULL
##
##
## $error
## $error[[1]]
## NULL
##
## $error[[2]]
## NULL
##
## $error[[3]]
## NULL
##
## $error[[4]]
## <simpleError in .f(...): non-numeric argument to mathematical function>
purrr::transpose(map(list(0, 1, 10, "a"), .f=safe_log10))$result
## [[1]]
## [1] -Inf
##
## [[2]]
## [1] 0
##
## [[3]]
## [1] 1
##
## [[4]]
## NULL
unlist(purrr::transpose(map(list(0, 1, 10, "a"), .f=safe_log10))$result)
## [1] -Inf 0 1
purrr::transpose(map(list(0, 1, 10, "a"), .f=safe_log10))$error
## [[1]]
## NULL
##
## [[2]]
## NULL
##
## [[3]]
## NULL
##
## [[4]]
## <simpleError in .f(...): non-numeric argument to mathematical function>
map_lgl(purrr::transpose(map(list(0, 1, 10, "a"), .f=safe_log10))$error, is.null)
## [1] TRUE TRUE TRUE FALSE
# map2(.x, .y, .f)
map2(list(5, 10, 20), list(1, 2, 3), .f=rnorm) # rnorm(5, 1), rnorm(10, 2), and rnorm(20, 3)
## [[1]]
## [1] 0.41176421 2.00652288 0.06152025 0.46963873 1.15436157
##
## [[2]]
## [1] 0.006821057 2.902712636 1.436150816 1.377836302 2.625075832
## [6] 0.680797806 0.313499192 0.718062969 2.820989906 3.134207742
##
## [[3]]
## [1] 3.3716474 2.9393673 1.8648940 3.2343343 2.1849894 2.0697179 1.0872014
## [8] 3.4970403 3.5769694 3.0999340 1.2033341 0.9839011 2.9820314 1.7116383
## [15] 0.8779558 1.6990118 2.5914013 2.3587803 3.7460957 1.2980312
# pmap(.l, .f)
pmap(list(n=list(5, 10, 20), mean=list(1, 5, 10), sd=list(0.1, 0.5, 0.1)), rnorm)
## [[1]]
## [1] 1.0151570 1.1573287 1.0628581 0.8805484 0.9418430
##
## [[2]]
## [1] 5.032920 4.689799 5.423525 5.265610 4.727383 5.252325 5.166292
## [8] 4.861745 5.135408 4.106679
##
## [[3]]
## [1] 9.854138 10.090939 10.045554 9.970755 10.092487 9.769531 10.140064
## [8] 9.834716 10.196817 10.047367 10.054093 10.006439 10.142002 10.092259
## [15] 10.222459 10.082440 10.067818 9.993884 10.078380 9.936942
# invoke_map(.f, .x, ...)
invoke_map(list(rnorm, runif, rexp), n=5)
## [[1]]
## [1] -0.96707137 0.08207476 1.39498168 0.60287972 -0.15130461
##
## [[2]]
## [1] 0.01087442 0.02980483 0.81443586 0.88438198 0.67976034
##
## [[3]]
## [1] 0.2646751 1.3233260 1.1079261 1.3504952 0.6795524
# walk() is for the side effects of a function
x <- list(1, "\n\ta\n", 3)
x %>% walk(cat)
## 1
## a
## 3
# Chaining is available by way of the %>% operator
pretty_titles <- c("N(0, 1)", "Uniform(0, 1)", "Exponential (rate=1)")
set.seed(1607120947)
x <- invoke_map(list(rnorm, runif, rexp), n=5000)
foo <- function(x) { map(x, .f=summary) }
par(mfrow=c(1, 3))
pwalk(list(x=x, main=pretty_titles), .f=hist, xlab="", col="light blue") %>% map(.f=foo)
## $x
## $x[[1]]
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -3.711000 -0.637800 -0.000217 0.006543 0.671800 3.633000
##
## $x[[2]]
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0001241 0.2518000 0.5012000 0.5028000 0.7566000 0.9999000
##
## $x[[3]]
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00001 0.29140 0.68340 0.98260 1.37900 8.46300
##
##
## $main
## $main[[1]]
## Length Class Mode
## 1 character character
##
## $main[[2]]
## Length Class Mode
## 1 character character
##
## $main[[3]]
## Length Class Mode
## 1 character character
par(mfrow=c(1, 1))
There are two potentially desirable behaviors with functions:
As a best practice, R functions that will be used for programming (as opposed to interactive command line work) should be written in a robust manner. Three standard problems should be avoided/mitigated:
There are several methods available for throwing errors within an R function:
One example that commonly creates surprises is the [,] operator for extraction. Adding [ , , drop=FALSE] ensures that you will still have what you passed (e.g., a matrix or data frame) rather than conversion of a chunk of data to a vector.
Another common source of error is sapply() which will return a vector when it can and a list otherwise. The map() and map_typ() functions in purrr are designed to be type-stable; if the output is not as expected, they will error out.
Non-standard evaluations take advantage of the existence of something else (e.g., a variable in the parent environment that has not been passed). This can cause confusion and improper results.
Pure functions have the key properties that 1) their output depends only on their inputs, and 2) they do not impact the outside world other than by way of their return value. Specifically, the function should not depend on how the user has configured their global options as shown in options(), nor should it modify those options() settings upon return of control to the parent environment.
A few examples are shown below:
# Throwing errors to stop a function (cannot actually run these!)
# stopifnot(FALSE)
# if (FALSE) { stop("Error: ", call.=FALSE) }
# if (FALSE) { stop("Error: This condition needed to be set as TRUE", call.=FALSE) }
# Behavior of [,] and [,,drop=FALSE]
mtxTest <- matrix(data=1:9, nrow=3, byrow=TRUE)
class(mtxTest)
## [1] "matrix"
mtxTest[1, ]
## [1] 1 2 3
class(mtxTest[1, ])
## [1] "integer"
mtxTest[1, , drop=FALSE]
## [,1] [,2] [,3]
## [1,] 1 2 3
class(mtxTest[1, , drop=FALSE])
## [1] "matrix"
# Behavior of sapply() - may not get what you are expecting
foo <- function(x) { x^2 }
sapply(1:5, FUN=foo)
## [1] 1 4 9 16 25
class(sapply(1:5, FUN=foo))
## [1] "numeric"
sapply(c(1, list(1.5, 2, 2.5), 3, 4, 5), FUN=foo)
## [1] 1.00 2.25 4.00 6.25 9.00 16.00 25.00
class(sapply(c(1, list(1.5, 2, 2.5), 3, 4, 5), FUN=foo))
## [1] "numeric"
sapply(list(1, c(1.5, 2, 2.5), 3, 4, 5), FUN=foo)
## [[1]]
## [1] 1
##
## [[2]]
## [1] 2.25 4.00 6.25
##
## [[3]]
## [1] 9
##
## [[4]]
## [1] 16
##
## [[5]]
## [1] 25
class(sapply(list(1, c(1.5, 2, 2.5), 3, 4, 5), FUN=foo))
## [1] "list"
This was a very enjoyable and instructive course.
Chapter 1 - Introduction to Object Oriented Programming (OOP)
Typical R usage involves a functional programming style - data to function to new data to new function to newer values and etc. Object Oriented Programming (OOP) instead involves thinking about the data structures (objects), their functionalities, and the like:
There are nine different options for OOP in R:
How does R distinguish types of variables?
Assigning Classes and Implicit Classes:
Example code includes:
# Create these variables
a_numeric_vector <- rlnorm(50)
a_factor <- factor(
sample(c(LETTERS[1:5], NA), 50, replace = TRUE)
)
a_data_frame <- data.frame(
n = a_numeric_vector,
f = a_factor
)
a_linear_model <- lm(dist ~ speed, cars)
# Call summary() on the numeric vector
summary(a_numeric_vector)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.08694 0.58120 1.06400 1.63500 1.48800 7.43600
# Do the same for the other three objects
summary(a_factor)
## A B C D E NA's
## 5 9 8 11 11 6
summary(a_data_frame)
## n f
## Min. :0.08694 A : 5
## 1st Qu.:0.58121 B : 9
## Median :1.06361 C : 8
## Mean :1.63546 D :11
## 3rd Qu.:1.48764 E :11
## Max. :7.43560 NA's: 6
summary(a_linear_model)
##
## Call:
## lm(formula = dist ~ speed, data = cars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -29.069 -9.525 -2.272 9.215 43.201
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -17.5791 6.7584 -2.601 0.0123 *
## speed 3.9324 0.4155 9.464 1.49e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.38 on 48 degrees of freedom
## Multiple R-squared: 0.6511, Adjusted R-squared: 0.6438
## F-statistic: 89.57 on 1 and 48 DF, p-value: 1.49e-12
type_info <-
function(x)
{
c(
class = class(x),
typeof = typeof(x),
mode = mode(x),
storage.mode = storage.mode(x)
)
}
# Create list of example variables
some_vars <- list(
an_integer_vector = rpois(24, lambda = 5),
a_numeric_vector = rbeta(24, shape1 = 1, shape2 = 1),
an_integer_array = array(rbinom(24, size = 8, prob = 0.5), dim = c(2, 3, 4)),
a_numeric_array = array(rweibull(24, shape = 1, scale = 1), dim = c(2, 3, 4)),
a_data_frame = data.frame(int = rgeom(24, prob = 0.5), num = runif(24)),
a_factor = factor(month.abb),
a_formula = y ~ x,
a_closure_function = mean,
a_builtin_function = length,
a_special_function = `if`
)
# Loop over some_vars calling type_info() on each element to explore them
lapply(some_vars, FUN=type_info)
## $an_integer_vector
## class typeof mode storage.mode
## "integer" "integer" "numeric" "integer"
##
## $a_numeric_vector
## class typeof mode storage.mode
## "numeric" "double" "numeric" "double"
##
## $an_integer_array
## class typeof mode storage.mode
## "array" "integer" "numeric" "integer"
##
## $a_numeric_array
## class typeof mode storage.mode
## "array" "double" "numeric" "double"
##
## $a_data_frame
## class typeof mode storage.mode
## "data.frame" "list" "list" "list"
##
## $a_factor
## class typeof mode storage.mode
## "factor" "integer" "numeric" "integer"
##
## $a_formula
## class typeof mode storage.mode
## "formula" "language" "call" "language"
##
## $a_closure_function
## class typeof mode storage.mode
## "function" "closure" "function" "function"
##
## $a_builtin_function
## class typeof mode storage.mode
## "function" "builtin" "function" "function"
##
## $a_special_function
## class typeof mode storage.mode
## "function" "special" "function" "function"
whiteChess <- list(king="g1", queen="h4", bishops=c("c2", "g5"), knights=character(0), rooks=c("f1", "f6"), pawns=c("a2", "b2", "d4", "e3", "g2", "h2"))
blackChess <- list(king="g8", queen="d7", bishops=c("b7", "e7"), knights=character(0), rooks=c("a6", "f8"), pawns=c("a5", "c3", "c4", "d5", "f7", "g6"))
chess <- list(white=whiteChess, black=blackChess)
# Explore the structure of chess
str(chess)
## List of 2
## $ white:List of 6
## ..$ king : chr "g1"
## ..$ queen : chr "h4"
## ..$ bishops: chr [1:2] "c2" "g5"
## ..$ knights: chr(0)
## ..$ rooks : chr [1:2] "f1" "f6"
## ..$ pawns : chr [1:6] "a2" "b2" "d4" "e3" ...
## $ black:List of 6
## ..$ king : chr "g8"
## ..$ queen : chr "d7"
## ..$ bishops: chr [1:2] "b7" "e7"
## ..$ knights: chr(0)
## ..$ rooks : chr [1:2] "a6" "f8"
## ..$ pawns : chr [1:6] "a5" "c3" "c4" "d5" ...
# Override the class of chess
class(chess) <- "chess_game"
# Is chess still a list?
is.list(chess)
## [1] TRUE
# How many pieces are left on the board?
length(unlist(chess))
## [1] 24
type_info(chess) # note that typeof(), mode(), and storage.mode() all remained as list
## class typeof mode storage.mode
## "chess_game" "list" "list" "list"
Chapter 2 - Using S3
Function overloading is the property of a function of input-dependent behavior:
Methodical Thinking - determining which methods are available for an S3 generic:
S3 and Primitive Functions:
Too Much Class:
Example code includes:
# Create get_n_elements
get_n_elements <- function(x, ...) {
UseMethod("get_n_elements")
}
# View get_n_elements
get_n_elements
## function(x, ...) {
## UseMethod("get_n_elements")
## }
# Create a data.frame method for get_n_elements
get_n_elements.data.frame <- function(x, ...) {
nrow(x) * ncol(x)
}
# Call the method on the sleep dataset
n_elements_sleep <- get_n_elements(sleep)
# View the result
n_elements_sleep
## [1] 60
# View pre-defined objects
# ls.str() ## Do not run, this can be a cluster with many variables loaded . . .
# Create a default method for get_n_elements
get_n_elements.default <- function(x, ...) {
length(unlist(x))
}
# Call the method on the ability.cov dataset
n_elements_ability.cov <- get_n_elements(ability.cov)
# Find methods for print
methods("print")
## [1] print.acf*
## [2] print.AES*
## [3] print.all_vars*
## [4] print.anova*
## [5] print.any_vars*
## [6] print.aov*
## [7] print.aovlist*
## [8] print.ar*
## [9] print.Arima*
## [10] print.arima0*
## [11] print.AsIs
## [12] print.aspell*
## [13] print.aspell_inspect_context*
## [14] print.bibentry*
## [15] print.Bibtex*
## [16] print.BoolResult*
## [17] print.browseVignettes*
## [18] print.by
## [19] print.bytes*
## [20] print.changedFiles*
## [21] print.check_code_usage_in_package*
## [22] print.check_compiled_code*
## [23] print.check_demo_index*
## [24] print.check_depdef*
## [25] print.check_details*
## [26] print.check_doi_db*
## [27] print.check_dotInternal*
## [28] print.check_make_vars*
## [29] print.check_nonAPI_calls*
## [30] print.check_package_code_assign_to_globalenv*
## [31] print.check_package_code_attach*
## [32] print.check_package_code_data_into_globalenv*
## [33] print.check_package_code_startup_functions*
## [34] print.check_package_code_syntax*
## [35] print.check_package_code_unload_functions*
## [36] print.check_package_compact_datasets*
## [37] print.check_package_CRAN_incoming*
## [38] print.check_package_datasets*
## [39] print.check_package_depends*
## [40] print.check_package_description*
## [41] print.check_package_description_encoding*
## [42] print.check_package_license*
## [43] print.check_packages_in_dir*
## [44] print.check_packages_in_dir_changes*
## [45] print.check_packages_used*
## [46] print.check_po_files*
## [47] print.check_Rd_contents*
## [48] print.check_Rd_line_widths*
## [49] print.check_Rd_metadata*
## [50] print.check_Rd_xrefs*
## [51] print.check_so_symbols*
## [52] print.check_T_and_F*
## [53] print.check_url_db*
## [54] print.check_vignette_index*
## [55] print.checkDocFiles*
## [56] print.checkDocStyle*
## [57] print.checkFF*
## [58] print.checkRd*
## [59] print.checkReplaceFuns*
## [60] print.checkS3methods*
## [61] print.checkTnF*
## [62] print.checkVignettes*
## [63] print.citation*
## [64] print.codoc*
## [65] print.codocClasses*
## [66] print.codocData*
## [67] print.colorConverter*
## [68] print.compactPDF*
## [69] print.condition
## [70] print.connection
## [71] print.CRAN_package_reverse_dependencies_and_views*
## [72] print.data.frame
## [73] print.Date
## [74] print.default
## [75] print.dendrogram*
## [76] print.density*
## [77] print.dictionary*
## [78] print.difftime
## [79] print.dist*
## [80] print.Dlist
## [81] print.DLLInfo
## [82] print.DLLInfoList
## [83] print.DLLRegisteredRoutines
## [84] print.dummy_coef*
## [85] print.dummy_coef_list*
## [86] print.ecdf*
## [87] print.element*
## [88] print.factanal*
## [89] print.factor
## [90] print.family*
## [91] print.fileSnapshot*
## [92] print.findLineNumResult*
## [93] print.flatGridListing*
## [94] print.formula*
## [95] print.frame*
## [96] print.fseq*
## [97] print.ftable*
## [98] print.fun_list*
## [99] print.function
## [100] print.getAnywhere*
## [101] print.ggplot*
## [102] print.ggplot2_bins*
## [103] print.ggproto*
## [104] print.ggproto_method*
## [105] print.gList*
## [106] print.glm*
## [107] print.glue*
## [108] print.gpar*
## [109] print.grob*
## [110] print.gtable*
## [111] print.hclust*
## [112] print.help_files_with_topic*
## [113] print.hexmode
## [114] print.HoltWinters*
## [115] print.hsearch*
## [116] print.hsearch_db*
## [117] print.htest*
## [118] print.html*
## [119] print.html_dependency*
## [120] print.indexed*
## [121] print.infl*
## [122] print.integrate*
## [123] print.isoreg*
## [124] print.kmeans*
## [125] print.knitr_kable*
## [126] print.Latex*
## [127] print.LaTeX*
## [128] print.lazy*
## [129] print.libraryIQR
## [130] print.listof
## [131] print.lm*
## [132] print.loadings*
## [133] print.location*
## [134] print.loess*
## [135] print.logLik*
## [136] print.ls_str*
## [137] print.medpolish*
## [138] print.MethodsFunction*
## [139] print.mtable*
## [140] print.NativeRoutineList
## [141] print.news_db*
## [142] print.nls*
## [143] print.noquote
## [144] print.numeric_version
## [145] print.object_size*
## [146] print.octmode
## [147] print.packageDescription*
## [148] print.packageInfo
## [149] print.packageIQR*
## [150] print.packageStatus*
## [151] print.pairwise.htest*
## [152] print.path*
## [153] print.PDF_Array*
## [154] print.PDF_Dictionary*
## [155] print.pdf_doc*
## [156] print.pdf_fonts*
## [157] print.PDF_Indirect_Reference*
## [158] print.pdf_info*
## [159] print.PDF_Keyword*
## [160] print.PDF_Name*
## [161] print.PDF_Stream*
## [162] print.PDF_String*
## [163] print.person*
## [164] print.POSIXct
## [165] print.POSIXlt
## [166] print.power.htest*
## [167] print.ppr*
## [168] print.prcomp*
## [169] print.princomp*
## [170] print.proc_time
## [171] print.quosure*
## [172] print.quoted*
## [173] print.R6*
## [174] print.R6ClassGenerator*
## [175] print.raster*
## [176] print.Rcpp_stack_trace*
## [177] print.Rd*
## [178] print.recordedplot*
## [179] print.rel*
## [180] print.restart
## [181] print.RGBcolorConverter*
## [182] print.rle
## [183] print.roman*
## [184] print.root_criterion*
## [185] print.rowwise_df*
## [186] print.SavedPlots*
## [187] print.sessionInfo*
## [188] print.shiny.tag*
## [189] print.shiny.tag.list*
## [190] print.simple.list
## [191] print.smooth.spline*
## [192] print.socket*
## [193] print.split*
## [194] print.src*
## [195] print.srcfile
## [196] print.srcref
## [197] print.stepfun*
## [198] print.stl*
## [199] print.StructTS*
## [200] print.subdir_tests*
## [201] print.summarize_CRAN_check_status*
## [202] print.summary.aov*
## [203] print.summary.aovlist*
## [204] print.summary.ecdf*
## [205] print.summary.glm*
## [206] print.summary.lm*
## [207] print.summary.loess*
## [208] print.summary.manova*
## [209] print.summary.nls*
## [210] print.summary.packageStatus*
## [211] print.summary.ppr*
## [212] print.summary.prcomp*
## [213] print.summary.princomp*
## [214] print.summary.table
## [215] print.summaryDefault
## [216] print.table
## [217] print.tables_aov*
## [218] print.tbl*
## [219] print.tbl_cube*
## [220] print.tbl_df*
## [221] print.terms*
## [222] print.theme*
## [223] print.trans*
## [224] print.trunc_mat*
## [225] print.ts*
## [226] print.tskernel*
## [227] print.TukeyHSD*
## [228] print.tukeyline*
## [229] print.tukeysmooth*
## [230] print.undoc*
## [231] print.uneval*
## [232] print.unit*
## [233] print.viewport*
## [234] print.vignette*
## [235] print.warnings
## [236] print.xgettext*
## [237] print.xngettext*
## [238] print.xtabs*
## see '?methods' for accessing help and source code
# Commented due to no dataset "hair" on my machine
# View the structure of hair
# str(hair)
# What primitive generics are available?
.S3PrimitiveGenerics
## [1] "anyNA" "as.character" "as.complex" "as.double"
## [5] "as.environment" "as.integer" "as.logical" "as.numeric"
## [9] "as.raw" "c" "dim" "dim<-"
## [13] "dimnames" "dimnames<-" "is.array" "is.finite"
## [17] "is.infinite" "is.matrix" "is.na" "is.nan"
## [21] "is.numeric" "length" "length<-" "levels<-"
## [25] "names" "names<-" "rep" "seq.int"
## [29] "xtfrm"
# Does length.hairstylist exist?
# exists("length.hairstylist")
# What is the length of hair?
# length(hair)
kitty <- "Miaow!"
# Assign classes
class(kitty) <- c("cat", "mammal", "character")
# Does kitty inherit from cat/mammal/character vector?
inherits(kitty, "cat")
## [1] TRUE
inherits(kitty, "mammal")
## [1] TRUE
inherits(kitty, "character")
## [1] TRUE
# Is kitty a character vector?
is.character(kitty)
## [1] TRUE
# Does kitty inherit from dog?
inherits(kitty, "dog")
## [1] FALSE
what_am_i <-
function(x, ...)
{
UseMethod("what_am_i")
}
# cat method
what_am_i.cat <- function(x, ...)
{
# Write a message
print("I'm a cat")
# Call NextMethod
NextMethod("what_am_i")
}
# mammal method
what_am_i.mammal <- function(x, ...)
{
# Write a message
print("I'm a mammal")
# Call NextMethod
NextMethod("what_am_i")
}
# character method
what_am_i.character <- function(x, ...)
{
# Write a message
print("I'm a character vector")
}
# Call what_am_i()
what_am_i(kitty)
## [1] "I'm a cat"
## [1] "I'm a mammal"
## [1] "I'm a character vector"
Chapter 3 - Using R6
Object factory - R6 provides a means of storing data and objects within the same variable:
Hiding Complexity with Encapsulation - should be able to use something even if the internal (hidden) functionality is very complicated:
Generally, data available in the “private” area of a class is not available to users:
Example code includes:
# Define microwave_oven_factory
microwave_oven_factory <- R6::R6Class(
"MicrowaveOven",
private=list(power_rating_watts=800)
)
# View the microwave_oven_factory
microwave_oven_factory
## <MicrowaveOven> object generator
## Public:
## clone: function (deep = FALSE)
## Private:
## power_rating_watts: 800
## Parent env: <environment: R_GlobalEnv>
## Locked objects: TRUE
## Locked class: FALSE
## Portable: TRUE
# Make a new microwave oven
microwave_oven <- microwave_oven_factory$new()
# Add a cook method to the factory definition
microwave_oven_factory <- R6::R6Class(
"MicrowaveOven",
private = list(
power_rating_watts = 800
),
public = list(
cook = function(time_seconds) {
Sys.sleep(time_seconds)
print("Your food is cooked!")
}
)
)
# Create microwave oven object
a_microwave_oven <- microwave_oven_factory$new()
# Call cook method for 1 second
a_microwave_oven$cook(time_seconds=1)
## [1] "Your food is cooked!"
# Add a close_door() method
microwave_oven_factory <- R6::R6Class(
"MicrowaveOven",
private = list(
power_rating_watts = 800,
door_is_open = FALSE
),
public = list(
cook = function(time_seconds) {
Sys.sleep(time_seconds)
print("Your food is cooked!")
},
open_door = function() {
private$door_is_open = TRUE
},
close_door = function() {
private$door_is_open = FALSE
}
)
)
# Add an initialize method
microwave_oven_factory <- R6::R6Class(
"MicrowaveOven",
private = list(
power_rating_watts = 800,
door_is_open = FALSE
),
public = list(
cook = function(time_seconds) {
Sys.sleep(time_seconds)
print("Your food is cooked!")
},
open_door = function() {
private$door_is_open = TRUE
},
close_door = function() {
private$door_is_open = FALSE
},
# Add initialize() method here
initialize = function(power_rating_watts, door_is_open) {
if (!missing(power_rating_watts)) {
private$power_rating_watts <- power_rating_watts
}
if (!missing(door_is_open)) {
private$door_is_open <- door_is_open
}
}
)
)
# Make a microwave
a_microwave_oven <- microwave_oven_factory$new(power_rating_watts=650, door_is_open=TRUE)
# Add a binding for power rating
microwave_oven_factory <- R6::R6Class(
"MicrowaveOven",
private = list(
..power_rating_watts = 800
),
active = list(
# add the binding here
power_rating_watts = function() {
private$..power_rating_watts
}
)
)
# Make a microwave
a_microwave_oven <- microwave_oven_factory$new()
# Get the power rating
a_microwave_oven$power_rating_watts
## [1] 800
# Add a binding for power rating
microwave_oven_factory <- R6::R6Class(
"MicrowaveOven",
private = list(
..power_rating_watts = 800,
..power_level_watts = 800
),
# Add active list containing an active binding
active=list(
power_level_watts = function(value) {
if (missing(value)) {
private$..power_level_watts
} else {
assertive.types::assert_is_a_number(value, severity="warning")
assertive.numbers::assert_all_are_in_closed_range(value,
lower=0,
upper=private$..power_rating_watts,
severity="warning"
)
private$..power_level_watts <- value
}
}
)
)
# Make a microwave
a_microwave_oven <- microwave_oven_factory$new()
# Get the power level
a_microwave_oven$power_level_watts
## [1] 800
# Try to set the power level to "400"
a_microwave_oven$power_level_watts <- "400"
## Warning in (function (value) : is_a_number : value is not of class
## 'numeric'; it has class 'character'.
## Warning: Coercing value to class 'numeric'.
# Try to set the power level to 1600 watts
a_microwave_oven$power_level_watts <- 1600
## Warning in (function (value) : is_in_closed_range : value are not all in the range [0,800].
## There was 1 failure:
## Position Value Cause
## 1 1 1600 too high
# Set the power level to 400 watts
a_microwave_oven$power_level_watts <- 400
Chapter 4 - R6 Inheritance
Inheritance is an attempt to avoid “copy and paste” from one class to another (dependent, fancier, or the like) class:
Extend or Override to create additional functionality:
Multiple Levels of Inheritance - a can inherit from b that inherited from c and the like:
Example code includes:
microwave_oven_factory <-
R6::R6Class("MicrowaveOven",
private=list(..power_rating_watts=800,
..power_level_watts=800,
..door_is_open=FALSE
),
public=list(cook=function(time) Sys.sleep(time),
open_door=function() private$..door_is_open <- TRUE,
close_door = function() private$..door_is_open <- FALSE
),
active=list(power_rating_watts=function() private$..power_rating_watts,
power_level_watts = function(value) {
if (missing(value)) {
private$..power_level_watts
} else {
private$..power_level_watts <-
max(0,
min(private$..power_rating_watts,
as.numeric(value)
)
)
}
}
)
)
# Explore the microwave oven class
microwave_oven_factory
## <MicrowaveOven> object generator
## Public:
## cook: function (time)
## open_door: function ()
## close_door: function ()
## clone: function (deep = FALSE)
## Active bindings:
## power_rating_watts: function ()
## power_level_watts: function (value)
## Private:
## ..power_rating_watts: 800
## ..power_level_watts: 800
## ..door_is_open: FALSE
## Parent env: <environment: R_GlobalEnv>
## Locked objects: TRUE
## Locked class: FALSE
## Portable: TRUE
# Define a fancy microwave class inheriting from microwave oven
fancy_microwave_oven_factory <- R6::R6Class(
"FancyMicrowaveOven",
inherit=microwave_oven_factory
)
# Explore microwave oven classes
microwave_oven_factory
## <MicrowaveOven> object generator
## Public:
## cook: function (time)
## open_door: function ()
## close_door: function ()
## clone: function (deep = FALSE)
## Active bindings:
## power_rating_watts: function ()
## power_level_watts: function (value)
## Private:
## ..power_rating_watts: 800
## ..power_level_watts: 800
## ..door_is_open: FALSE
## Parent env: <environment: R_GlobalEnv>
## Locked objects: TRUE
## Locked class: FALSE
## Portable: TRUE
fancy_microwave_oven_factory
## <FancyMicrowaveOven> object generator
## Inherits from: <microwave_oven_factory>
## Public:
## clone: function (deep = FALSE)
## Parent env: <environment: R_GlobalEnv>
## Locked objects: TRUE
## Locked class: FALSE
## Portable: TRUE
# Instantiate both types of microwave
a_microwave_oven <- microwave_oven_factory$new()
a_fancy_microwave <- fancy_microwave_oven_factory$new()
# Get power rating for each microwave
microwave_power_rating <- a_microwave_oven$power_level_watts
fancy_microwave_power_rating <- a_fancy_microwave$power_level_watts
# Verify that these are the same
identical(microwave_power_rating, fancy_microwave_power_rating)
## [1] TRUE
# Cook with each microwave
a_microwave_oven$cook(1)
a_fancy_microwave$cook(1)
# Explore microwave oven class
microwave_oven_factory
## <MicrowaveOven> object generator
## Public:
## cook: function (time)
## open_door: function ()
## close_door: function ()
## clone: function (deep = FALSE)
## Active bindings:
## power_rating_watts: function ()
## power_level_watts: function (value)
## Private:
## ..power_rating_watts: 800
## ..power_level_watts: 800
## ..door_is_open: FALSE
## Parent env: <environment: R_GlobalEnv>
## Locked objects: TRUE
## Locked class: FALSE
## Portable: TRUE
# Extend the class definition
fancy_microwave_oven_factory <- R6::R6Class(
"FancyMicrowaveOven",
inherit = microwave_oven_factory,
# Add a public list with a cook baked potato method
public = list(
cook_baked_potato=function() {
self$cook(3)
}
)
)
# Instantiate a fancy microwave
a_fancy_microwave <- fancy_microwave_oven_factory$new()
# Call the cook_baked_potato() method
a_fancy_microwave$cook_baked_potato()
# Explore microwave oven class
microwave_oven_factory
## <MicrowaveOven> object generator
## Public:
## cook: function (time)
## open_door: function ()
## close_door: function ()
## clone: function (deep = FALSE)
## Active bindings:
## power_rating_watts: function ()
## power_level_watts: function (value)
## Private:
## ..power_rating_watts: 800
## ..power_level_watts: 800
## ..door_is_open: FALSE
## Parent env: <environment: R_GlobalEnv>
## Locked objects: TRUE
## Locked class: FALSE
## Portable: TRUE
# Update the class definition
fancy_microwave_oven_factory <- R6::R6Class(
"FancyMicrowaveOven",
inherit = microwave_oven_factory,
# Add a public list with a cook method
public = list(
cook = function(time_seconds) {
super$cook(time_seconds)
message("Enjoy your dinner!")
}
)
)
# Instantiate a fancy microwave
a_fancy_microwave <- fancy_microwave_oven_factory$new()
# Call the cook() method
a_fancy_microwave$cook(1)
## Enjoy your dinner!
# Expose the parent functionality
fancy_microwave_oven_factory <- R6::R6Class(
"FancyMicrowaveOven",
inherit = microwave_oven_factory,
public = list(
cook_baked_potato = function() {
self$cook(3)
},
cook = function(time_seconds) {
super$cook(time_seconds)
message("Enjoy your dinner!")
}
),
# Add an active element with a super_ binding
active = list(
super_ = function() super
)
)
# Instantiate a fancy microwave
a_fancy_microwave <- fancy_microwave_oven_factory$new()
# Call the super_ binding
a_fancy_microwave$super_
## <environment: 0x000000000c9a1b50>
ascii_pizza_slice <- " __\n // \"\"--.._\n|| (_) _ \"-._\n|| _ (_) '-.\n|| (_) __..-'\n \\\\__..--\"\""
# Explore other microwaves
microwave_oven_factory
## <MicrowaveOven> object generator
## Public:
## cook: function (time)
## open_door: function ()
## close_door: function ()
## clone: function (deep = FALSE)
## Active bindings:
## power_rating_watts: function ()
## power_level_watts: function (value)
## Private:
## ..power_rating_watts: 800
## ..power_level_watts: 800
## ..door_is_open: FALSE
## Parent env: <environment: R_GlobalEnv>
## Locked objects: TRUE
## Locked class: FALSE
## Portable: TRUE
fancy_microwave_oven_factory
## <FancyMicrowaveOven> object generator
## Inherits from: <microwave_oven_factory>
## Public:
## cook_baked_potato: function ()
## cook: function (time_seconds)
## clone: function (deep = FALSE)
## Active bindings:
## super_: function ()
## Parent env: <environment: R_GlobalEnv>
## Locked objects: TRUE
## Locked class: FALSE
## Portable: TRUE
# Define a high-end microwave oven class
high_end_microwave_oven_factory <- R6::R6Class(
"HighEndMicrowaveOven",
inherit=fancy_microwave_oven_factory,
public=list(
cook=function(time_seconds) {
super$super_$cook(time_seconds)
message(ascii_pizza_slice)
}
)
)
# Instantiate a high-end microwave oven
a_high_end_microwave <- high_end_microwave_oven_factory$new()
# Use it to cook for one second
a_high_end_microwave$cook(1)
## __
## // ""--.._
## || (_) _ "-._
## || _ (_) '-.
## || (_) __..-'
## \\__..--""
Chapter 5 - Advanced R6 Usage
Environments, Reference Behavior, and Static Fields:
Cloning Objects - R6 is built using environments, so the “copy by reference” is part and parcel of R6:
Shut it Down - if the R6 object is linked to any databases or has any side effects, it can be a good idea to shut it down:
Example code includes:
# Define a new environment
env <- new.env()
# Add an element named perfect
env$perfect <- c(6, 28, 496)
# Add an element named bases
env[["bases"]] <- c("A", "C", "G", "T")
# Assign lst and env
lst <- list(
perfect = c(6, 28, 496),
bases = c("A", "C", "G", "T")
)
env <- list2env(lst)
# Copy lst
lst2 <- lst
# Change lst's bases element
lst$bases <- c("A", "C", "G", "U")
# Test lst and lst2 identical
identical(lst$bases, lst2$bases)
## [1] FALSE
# Copy env
env2 <- env
# Change env's bases element
env$bases <- c("A", "C", "G", "U")
# Test env and env2 identical
identical(env$bases, env2$bases)
## [1] TRUE
# Complete the class definition
env_microwave_oven_factory <- R6::R6Class(
"MicrowaveOven",
private = list(
shared = {
# Create a new environment named e
e <- new.env()
# Assign safety_warning into e
e$safety_warning <- "Warning. Do not try to cook metal objects."
# Return e
e
}
),
active = list(
# Add the safety_warning binding
safety_warning = function(value) {
if (missing(value)) {
private$shared$safety_warning
} else {
private$shared$safety_warning <- value
}
}
)
)
# Create two microwave ovens
a_microwave_oven <- env_microwave_oven_factory$new()
another_microwave_oven <- env_microwave_oven_factory$new()
# Change the safety warning for a_microwave_oven
a_microwave_oven$safety_warning <- "Warning. If the food is too hot you may scald yourself."
# Verify that the warning has change for another_microwave
another_microwave_oven$safety_warning
## [1] "Warning. If the food is too hot you may scald yourself."
# Still uses microwave_oven_factory as defined in Chapter 4
# Create a microwave oven
a_microwave_oven <- microwave_oven_factory$new()
# Copy a_microwave_oven using <-
assigned_microwave_oven <- a_microwave_oven
# Copy a_microwave_oven using clone()
cloned_microwave_oven <- a_microwave_oven$clone()
# Change a_microwave_oven's power level
a_microwave_oven$power_level_watts <- 400
# Check a_microwave_oven & assigned_microwave_oven same
identical(a_microwave_oven$power_level_watts, assigned_microwave_oven$power_level_watts)
## [1] TRUE
# Check a_microwave_oven & cloned_microwave_oven different
!identical(a_microwave_oven$power_level_watts, cloned_microwave_oven$power_level_watts)
## [1] TRUE
# Commented, due to never defined power_plug
# Create a microwave oven
# a_microwave_oven <- microwave_oven_factory$new()
# Look at its power plug
# a_microwave_oven$power_plug
# Copy a_microwave_oven using clone(), no args
# cloned_microwave_oven <- a_microwave_oven$clone()
# Copy a_microwave_oven using clone(), deep = TRUE
# deep_cloned_microwave_oven <- a_microwave_oven$clone(deep=TRUE)
# Change a_microwave_oven's power plug type
# a_microwave_oven$power_plug$type <- "British"
# Check a_microwave_oven & cloned_microwave_oven same
# identical(a_microwave_oven$power_plug$type, cloned_microwave_oven$power_plug$type)
# Check a_microwave_oven & deep_cloned_microwave_oven different
# !identical(a_microwave_oven$power_plug$type, deep_cloned_microwave_oven$power_plug$type)
# Commented due to not having this SQL database
# Microwave_factory is pre-defined
# microwave_oven_factory
# Complete the class definition
# smart_microwave_oven_factory <- R6::R6Class(
# "SmartMicrowaveOven",
# inherit = microwave_oven_factory, # Specify inheritance
# private = list(
# conn = NULL
# ),
# public = list(
# initialize = function() {
# # Connect to the database
# private$conn = dbConnect(SQLite(), "cooking-times.sqlite")
# },
# get_cooking_time = function(food) {
# dbGetQuery(
# private$conn,
# sprintf("SELECT time_seconds FROM cooking_times WHERE food = '%s'", food)
# )
# },
# finalize = function() {
# message("Disconnecting from the cooking times database.")
# dbDisconnect(private$conn)
# }
# )
# )
# Create a smart microwave object
# a_smart_microwave <- smart_microwave_oven_factory$new()
# Call the get_cooking_time() method
# a_smart_microwave$get_cooking_time("soup")
# Remove the smart microwave
# rm(a_smart_microwave)
# Force garbage collection
# gc()
A nice introduction to S3 and R6.
Chapter 1 - What is Machine Learning?
Machine learning is the process of constructing and using algorithms that learn from data:
Classification, Regression, Clustering are three common forms of machine learning problems:
Supervised vs Unsupervised Learning:
Example code includes:
data(iris, package="datasets")
# Reveal number of observations and variables in two different ways
str(iris)
## 'data.frame': 150 obs. of 5 variables:
## $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
dim(iris)
## [1] 150 5
# Show first and last observations in the iris data set
head(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
tail(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 145 6.7 3.3 5.7 2.5 virginica
## 146 6.7 3.0 5.2 2.3 virginica
## 147 6.3 2.5 5.0 1.9 virginica
## 148 6.5 3.0 5.2 2.0 virginica
## 149 6.2 3.4 5.4 2.3 virginica
## 150 5.9 3.0 5.1 1.8 virginica
# Summarize the iris data set
summary(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100
## 1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300
## Median :5.800 Median :3.000 Median :4.350 Median :1.300
## Mean :5.843 Mean :3.057 Mean :3.758 Mean :1.199
## 3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
## Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
## Species
## setosa :50
## versicolor:50
## virginica :50
##
##
##
data(Wage, package="ISLR")
# Build Linear Model: lm_wage (coded already)
lm_wage <- lm(wage ~ age, data = Wage)
# Define data.frame: unseen (coded already)
unseen <- data.frame(age = 60)
# Predict the wage for a 60-year old worker
predict(lm_wage, unseen)
## 1
## 124.1413
emails <- data.frame(
avg_capital_seq=c( 1, 2.11, 4.12, 1.86, 2.97, 1.69, 5.891, 3.17, 1.23, 2.44, 3.56, 3.25, 1.33 ),
spam=as.integer(c( 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1 ))
)
str(emails)
## 'data.frame': 13 obs. of 2 variables:
## $ avg_capital_seq: num 1 2.11 4.12 1.86 2.97 ...
## $ spam : int 0 0 1 0 1 0 1 0 0 1 ...
# Show the dimensions of emails
dim(emails)
## [1] 13 2
# Inspect definition of spam_classifier()
spam_classifier <- function(x){
prediction <- rep(NA, length(x)) # initialize prediction vector
prediction[x > 4] <- 1
prediction[x >= 3 & x <= 4] <- 0
prediction[x >= 2.2 & x < 3] <- 1
prediction[x >= 1.4 & x < 2.2] <- 0
prediction[x > 1.25 & x < 1.4] <- 1
prediction[x <= 1.25] <- 0
return(prediction) # prediction is either 0 or 1
}
# Apply the classifier to the avg_capital_seq column: spam_pred
spam_pred <- spam_classifier(emails$avg_capital_seq)
# Compare spam_pred to emails$spam. Use ==
spam_pred == emails$spam
## [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
linkedin <- c( 5, 7, 4, 9, 11, 10, 14, 17, 13, 11, 18, 17, 21, 21, 24, 23, 28, 35, 21, 27, 23 )
# Create the days vector
days <- 1:length(linkedin)
# Fit a linear model called on the linkedin views per day: linkedin_lm
linkedin_lm <- lm(linkedin ~ days)
# Predict the number of views for the next three days: linkedin_pred
future_days <- data.frame(days = 22:24)
linkedin_pred <- predict(linkedin_lm, future_days)
# Plot historical data and predictions
plot(linkedin ~ days, xlim = c(1, 24))
points(22:24, linkedin_pred, col = "green")
# Chop up iris in my_iris and species
my_iris <- iris[-5]
species <- iris$Species
# Perform k-means clustering on my_iris: kmeans_iris
kmeans_iris <- kmeans(my_iris, 3)
# Compare the actual Species to the clustering using table()
table(kmeans_iris$cluster, species)
## species
## setosa versicolor virginica
## 1 50 0 0
## 2 0 2 36
## 3 0 48 14
# Plot Petal.Width against Petal.Length, coloring by cluster
plot(Petal.Length ~ Petal.Width, data = my_iris, col = kmeans_iris$cluster)
# Take a look at the iris dataset
str(iris)
## 'data.frame': 150 obs. of 5 variables:
## $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
summary(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100
## 1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300
## Median :5.800 Median :3.000 Median :4.350 Median :1.300
## Mean :5.843 Mean :3.057 Mean :3.758 Mean :1.199
## 3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
## Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
## Species
## setosa :50
## versicolor:50
## virginica :50
##
##
##
# A decision tree model has been built for you
tree <- rpart::rpart(Species ~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width,
data = iris, method = "class")
# A dataframe containing unseen observations
unseen <- data.frame(Sepal.Length = c(5.3, 7.2),
Sepal.Width = c(2.9, 3.9),
Petal.Length = c(1.7, 5.4),
Petal.Width = c(0.8, 2.3)
)
# Predict the label of the unseen observations. Print out the result.
predict(tree, unseen, type="class")
## 1 2
## setosa virginica
## Levels: setosa versicolor virginica
data(mtcars, package="datasets")
cars <- mtcars[,c("wt", "hp")]
str(cars)
## 'data.frame': 32 obs. of 2 variables:
## $ wt: num 2.62 2.88 2.32 3.21 3.44 ...
## $ hp: num 110 110 93 110 175 105 245 62 95 123 ...
# Explore the cars dataset
str(cars)
## 'data.frame': 32 obs. of 2 variables:
## $ wt: num 2.62 2.88 2.32 3.21 3.44 ...
## $ hp: num 110 110 93 110 175 105 245 62 95 123 ...
summary(cars)
## wt hp
## Min. :1.513 Min. : 52.0
## 1st Qu.:2.581 1st Qu.: 96.5
## Median :3.325 Median :123.0
## Mean :3.217 Mean :146.7
## 3rd Qu.:3.610 3rd Qu.:180.0
## Max. :5.424 Max. :335.0
# Group the dataset into two clusters: km_cars
km_cars <- kmeans(cars, 2)
# Print out the contents of each cluster
km_cars$cluster
## Mazda RX4 Mazda RX4 Wag Datsun 710
## 2 2 2
## Hornet 4 Drive Hornet Sportabout Valiant
## 2 1 2
## Duster 360 Merc 240D Merc 230
## 1 2 2
## Merc 280 Merc 280C Merc 450SE
## 2 2 1
## Merc 450SL Merc 450SLC Cadillac Fleetwood
## 1 1 1
## Lincoln Continental Chrysler Imperial Fiat 128
## 1 1 2
## Honda Civic Toyota Corolla Toyota Corona
## 2 2 2
## Dodge Challenger AMC Javelin Camaro Z28
## 2 2 1
## Pontiac Firebird Fiat X1-9 Porsche 914-2
## 1 2 2
## Lotus Europa Ford Pantera L Ferrari Dino
## 2 1 1
## Maserati Bora Volvo 142E
## 1 2
# Group the dataset into two clusters: km_cars
km_cars <- kmeans(cars, 2)
# Add code: color the points in the plot based on the clusters
plot(cars, col=km_cars$cluster)
# Print out the cluster centroids
km_cars$centers
## wt hp
## 1 2.692000 99.47368
## 2 3.984923 215.69231
# Replace the ___ part: add the centroids to the plot
points(km_cars$centers, pch = 22, bg = c(1, 2), cex = 2)
Chapter 2 - Performance Measures
Measuring model performance or error - is the model good?
Training set and test set - power is about the ability to make predictions about unseen data:
Bias and variance are the main error sources for a predictive model:
Example code includes:
library(dplyr)
data(titanic_train, package="titanic")
titanic <- titanic_train %>%
select(Survived, Pclass, Sex, Age) %>%
mutate(Survived=factor(Survived, levels=c(1, 0)), Sex=factor(Sex)) %>%
na.omit()
# Have a look at the structure of titanic
str(titanic)
## 'data.frame': 714 obs. of 4 variables:
## $ Survived: Factor w/ 2 levels "1","0": 2 1 1 1 2 2 2 1 1 1 ...
## $ Pclass : int 3 1 3 1 3 1 3 3 2 3 ...
## $ Sex : Factor w/ 2 levels "female","male": 2 1 1 1 2 2 2 1 1 1 ...
## $ Age : num 22 38 26 35 35 54 2 27 14 4 ...
## - attr(*, "na.action")=Class 'omit' Named int [1:177] 6 18 20 27 29 30 32 33 37 43 ...
## .. ..- attr(*, "names")= chr [1:177] "6" "18" "20" "27" ...
# A decision tree classification model is built on the data
tree <- rpart::rpart(Survived ~ ., data = titanic, method = "class")
# Use the predict() method to make predictions, assign to pred
pred <- predict(tree, titanic, type="class")
# Use the table() method to make the confusion matrix
(conf <- table(titanic$Survived, pred))
## pred
## 1 0
## 1 212 78
## 0 53 371
# Assign TP, FN, FP and TN using conf
TP <- conf[1, 1] # this will be 212
FN <- conf[1, 2] # this will be 78
FP <- conf[2, 1] # fill in
TN <- conf[2, 2] # fill in
# Calculate and print the accuracy: acc
(acc <- sum(TP, TN) / sum(conf))
## [1] 0.8165266
# Calculate and print out the precision: prec
(prec <- TP / (TP + FP))
## [1] 0.8
# Calculate and print out the recall: rec
(rec <- TP / (TP + FN))
## [1] 0.7310345
# DO NOT HAVE THIS DATASET
# Take a look at the structure of air
# str(air)
# Inspect your colleague's code to build the model
# fit <- lm(dec ~ freq + angle + ch_length, data = air)
# Use the model to predict for all values: pred
# pred <- predict(fit, air)
# Use air$dec and pred to calculate the RMSE
# rmse <- sqrt( mean((air$dec-pred)^2) )
# Print out rmse
# rmse
# Previous model
# fit <- lm(dec ~ freq + angle + ch_length, data = air)
# pred <- predict(fit)
# rmse <- sqrt(sum( (air$dec - pred) ^ 2) / nrow(air))
# rmse
# Your colleague's more complex model
# fit2 <- lm(dec ~ freq + angle + ch_length + velocity + thickness, data = air)
# Use the model to predict for all values: pred2
# pred2 <- predict(fit2)
# Calculate rmse2
# rmse2 <- sqrt(sum( (air$dec - pred2) ^ 2) / nrow(air))
# Print out rmse2
# rmse2
# ALSO DO NOT HAVE THIS DATASET, THOUGH IT IS AVAILABLE ON UCI
# Explore the structure of the dataset
seeds <- read.delim("seeds.txt", header=FALSE,
col.names=c("area", "perimeter", "compactness", "length",
"width", "asymmetry", "groove", "type"
)
)
str(seeds)
## 'data.frame': 210 obs. of 8 variables:
## $ area : num 15.3 14.9 14.3 13.8 16.1 ...
## $ perimeter : num 14.8 14.6 14.1 13.9 15 ...
## $ compactness: num 0.871 0.881 0.905 0.895 0.903 ...
## $ length : num 5.76 5.55 5.29 5.32 5.66 ...
## $ width : num 3.31 3.33 3.34 3.38 3.56 ...
## $ asymmetry : num 2.22 1.02 2.7 2.26 1.35 ...
## $ groove : num 5.22 4.96 4.83 4.8 5.17 ...
## $ type : int 1 1 1 1 1 1 1 1 1 1 ...
# Group the seeds in three clusters
km_seeds <- kmeans(seeds[,-8], 3)
# Color the points in the plot based on the clusters
plot(length ~ compactness, data = seeds, col=km_seeds$cluster)
# Print out the ratio of the WSS to the BSS
with(km_seeds, tot.withinss / betweenss)
## [1] 0.2762846
# Shuffle the dataset, call the result shuffled
n <- nrow(titanic)
shuffled <- titanic[sample(n),]
# Split the data in train and test
train_indices <- 1:round(0.7*n)
train <- shuffled[train_indices, ]
test <- shuffled[-train_indices, ]
# Print the structure of train and test
str(train)
## 'data.frame': 500 obs. of 4 variables:
## $ Survived: Factor w/ 2 levels "1","0": 2 2 2 2 2 2 1 1 1 1 ...
## $ Pclass : int 3 2 3 3 3 3 1 3 1 3 ...
## $ Sex : Factor w/ 2 levels "female","male": 2 2 2 1 2 2 1 2 2 1 ...
## $ Age : num 17 25 25 2 2 70.5 33 29 36 5 ...
## - attr(*, "na.action")=Class 'omit' Named int [1:177] 6 18 20 27 29 30 32 33 37 43 ...
## .. ..- attr(*, "names")= chr [1:177] "6" "18" "20" "27" ...
str(test)
## 'data.frame': 214 obs. of 4 variables:
## $ Survived: Factor w/ 2 levels "1","0": 2 1 2 2 2 2 2 2 1 1 ...
## $ Pclass : int 1 3 3 3 2 3 1 2 3 3 ...
## $ Sex : Factor w/ 2 levels "female","male": 1 2 2 1 2 1 2 2 2 2 ...
## $ Age : num 2 29 32 14 31 18 37 36.5 21 32 ...
## - attr(*, "na.action")=Class 'omit' Named int [1:177] 6 18 20 27 29 30 32 33 37 43 ...
## .. ..- attr(*, "names")= chr [1:177] "6" "18" "20" "27" ...
# Fill in the model that has been learned.
tree <- rpart::rpart(Survived ~ ., data=train, method = "class")
# Predict the outcome on the test set with tree: pred
pred <- predict(tree, newdata=test, type="class")
# Calculate the confusion matrix: conf
(conf <- table(test$Survived, pred))
## pred
## 1 0
## 1 62 32
## 0 21 99
# Initialize the accs vector
accs <- rep(0,6)
for (i in 1:6) {
# These indices indicate the interval of the test set
indices <- (((i-1) * round((1/6)*nrow(shuffled))) + 1):((i*round((1/6) * nrow(shuffled))))
# Exclude them from the train set
train <- shuffled[-indices,]
# Include them in the test set
test <- shuffled[indices,]
# A model is learned using each training set
tree <- rpart::rpart(Survived ~ ., train, method = "class")
# Make a prediction on the test set using tree
pred <- predict(tree, newdata=test, type="class")
# Assign the confusion matrix to conf
conf <- table(test$Survived, pred)
# Assign the accuracy of this model to the ith index in accs
accs[i] <- sum(diag(conf))/sum(conf)
}
# Print out the mean of accs
mean(accs)
## [1] 0.7955182
data(spam, package="kernlab")
str(spam)
## 'data.frame': 4601 obs. of 58 variables:
## $ make : num 0 0.21 0.06 0 0 0 0 0 0.15 0.06 ...
## $ address : num 0.64 0.28 0 0 0 0 0 0 0 0.12 ...
## $ all : num 0.64 0.5 0.71 0 0 0 0 0 0.46 0.77 ...
## $ num3d : num 0 0 0 0 0 0 0 0 0 0 ...
## $ our : num 0.32 0.14 1.23 0.63 0.63 1.85 1.92 1.88 0.61 0.19 ...
## $ over : num 0 0.28 0.19 0 0 0 0 0 0 0.32 ...
## $ remove : num 0 0.21 0.19 0.31 0.31 0 0 0 0.3 0.38 ...
## $ internet : num 0 0.07 0.12 0.63 0.63 1.85 0 1.88 0 0 ...
## $ order : num 0 0 0.64 0.31 0.31 0 0 0 0.92 0.06 ...
## $ mail : num 0 0.94 0.25 0.63 0.63 0 0.64 0 0.76 0 ...
## $ receive : num 0 0.21 0.38 0.31 0.31 0 0.96 0 0.76 0 ...
## $ will : num 0.64 0.79 0.45 0.31 0.31 0 1.28 0 0.92 0.64 ...
## $ people : num 0 0.65 0.12 0.31 0.31 0 0 0 0 0.25 ...
## $ report : num 0 0.21 0 0 0 0 0 0 0 0 ...
## $ addresses : num 0 0.14 1.75 0 0 0 0 0 0 0.12 ...
## $ free : num 0.32 0.14 0.06 0.31 0.31 0 0.96 0 0 0 ...
## $ business : num 0 0.07 0.06 0 0 0 0 0 0 0 ...
## $ email : num 1.29 0.28 1.03 0 0 0 0.32 0 0.15 0.12 ...
## $ you : num 1.93 3.47 1.36 3.18 3.18 0 3.85 0 1.23 1.67 ...
## $ credit : num 0 0 0.32 0 0 0 0 0 3.53 0.06 ...
## $ your : num 0.96 1.59 0.51 0.31 0.31 0 0.64 0 2 0.71 ...
## $ font : num 0 0 0 0 0 0 0 0 0 0 ...
## $ num000 : num 0 0.43 1.16 0 0 0 0 0 0 0.19 ...
## $ money : num 0 0.43 0.06 0 0 0 0 0 0.15 0 ...
## $ hp : num 0 0 0 0 0 0 0 0 0 0 ...
## $ hpl : num 0 0 0 0 0 0 0 0 0 0 ...
## $ george : num 0 0 0 0 0 0 0 0 0 0 ...
## $ num650 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ lab : num 0 0 0 0 0 0 0 0 0 0 ...
## $ labs : num 0 0 0 0 0 0 0 0 0 0 ...
## $ telnet : num 0 0 0 0 0 0 0 0 0 0 ...
## $ num857 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ data : num 0 0 0 0 0 0 0 0 0.15 0 ...
## $ num415 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ num85 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ technology : num 0 0 0 0 0 0 0 0 0 0 ...
## $ num1999 : num 0 0.07 0 0 0 0 0 0 0 0 ...
## $ parts : num 0 0 0 0 0 0 0 0 0 0 ...
## $ pm : num 0 0 0 0 0 0 0 0 0 0 ...
## $ direct : num 0 0 0.06 0 0 0 0 0 0 0 ...
## $ cs : num 0 0 0 0 0 0 0 0 0 0 ...
## $ meeting : num 0 0 0 0 0 0 0 0 0 0 ...
## $ original : num 0 0 0.12 0 0 0 0 0 0.3 0 ...
## $ project : num 0 0 0 0 0 0 0 0 0 0.06 ...
## $ re : num 0 0 0.06 0 0 0 0 0 0 0 ...
## $ edu : num 0 0 0.06 0 0 0 0 0 0 0 ...
## $ table : num 0 0 0 0 0 0 0 0 0 0 ...
## $ conference : num 0 0 0 0 0 0 0 0 0 0 ...
## $ charSemicolon : num 0 0 0.01 0 0 0 0 0 0 0.04 ...
## $ charRoundbracket : num 0 0.132 0.143 0.137 0.135 0.223 0.054 0.206 0.271 0.03 ...
## $ charSquarebracket: num 0 0 0 0 0 0 0 0 0 0 ...
## $ charExclamation : num 0.778 0.372 0.276 0.137 0.135 0 0.164 0 0.181 0.244 ...
## $ charDollar : num 0 0.18 0.184 0 0 0 0.054 0 0.203 0.081 ...
## $ charHash : num 0 0.048 0.01 0 0 0 0 0 0.022 0 ...
## $ capitalAve : num 3.76 5.11 9.82 3.54 3.54 ...
## $ capitalLong : num 61 101 485 40 40 15 4 11 445 43 ...
## $ capitalTotal : num 278 1028 2259 191 191 ...
## $ type : Factor w/ 2 levels "nonspam","spam": 2 2 2 2 2 2 2 2 2 2 ...
emails_full <- spam %>%
select(capitalAve, type) %>%
mutate(avg_capital_seq=capitalAve, spam=factor(as.integer(type)-1, levels=c(1, 0))) %>%
select(avg_capital_seq, spam)
str(emails_full)
## 'data.frame': 4601 obs. of 2 variables:
## $ avg_capital_seq: num 3.76 5.11 9.82 3.54 3.54 ...
## $ spam : Factor w/ 2 levels "1","0": 1 1 1 1 1 1 1 1 1 1 ...
# The spam filter that has been 'learned' for you
spam_classifier <- function(x){
prediction <- rep(NA, length(x)) # initialize prediction vector
prediction[x > 4] <- 1
prediction[x >= 3 & x <= 4] <- 0
prediction[x >= 2.2 & x < 3] <- 1
prediction[x >= 1.4 & x < 2.2] <- 0
prediction[x > 1.25 & x < 1.4] <- 1
prediction[x <= 1.25] <- 0
return(factor(prediction, levels = c("1", "0"))) # prediction is either 0 or 1
}
# Apply spam_classifier to emails_full: pred_full
pred_full <- spam_classifier(emails_full$avg_capital_seq)
# Build confusion matrix for emails_full: conf_full
conf_full <- table(emails_full$spam, pred_full)
# Calculate the accuracy with conf_full: acc_full
(acc_full <- sum(diag(conf_full)) / sum(conf_full))
## [1] 0.6561617
emails_small <- data.frame(avg_capital_seq=c( 1, 2.112, 4.123, 1.863, 2.973, 1.687, 5.891,
3.167, 1.23, 2.441, 3.555, 3.25, 1.333
),
spam=factor(c(0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1), levels=c(1, 0))
)
str(emails_small)
## 'data.frame': 13 obs. of 2 variables:
## $ avg_capital_seq: num 1 2.11 4.12 1.86 2.97 ...
## $ spam : Factor w/ 2 levels "1","0": 2 2 1 2 1 2 1 2 2 1 ...
spam_classifier <- function(x){
prediction <- rep(NA, length(x))
prediction[x > 4] <- 1
prediction[x <= 4] <- 0
return(factor(prediction, levels = c("1", "0")))
}
# conf_small and acc_small have been calculated for you
conf_small <- table(emails_small$spam, spam_classifier(emails_small$avg_capital_seq))
acc_small <- sum(diag(conf_small)) / sum(conf_small)
acc_small
## [1] 0.7692308
# Apply spam_classifier to emails_full and calculate the confusion matrix: conf_full
conf_full <- table(emails_full$spam, spam_classifier(emails_full$avg_capital_seq))
# Calculate acc_full
(acc_full <- sum(diag(conf_full)) / sum(conf_full))
## [1] 0.7259291
Chapter 3 - Classification
Decision trees - assign class to an unseen observation (each observation consists of a vector of features, and a classification):
K-nearest-neighbors (knn) - an example of “instance based learning”:
ROC curve - Receiver Operator Characteristic curve - is a powerful performance measure for binary classification:
Example code includes:
titanic <- titanic_train %>%
select(Survived, Pclass, Sex, Age) %>%
mutate(Survived=factor(Survived, levels=c(1, 0)), Sex=factor(Sex), Pclass=factor(Pclass)) %>%
na.omit()
trIdx <- sample(x=1:nrow(titanic), size=round(.7*nrow(titanic)), replace=FALSE)
train <- titanic[trIdx, ]
test <- titanic[-trIdx, ]
str(train); str(test)
## 'data.frame': 500 obs. of 4 variables:
## $ Survived: Factor w/ 2 levels "1","0": 2 2 1 1 1 2 2 1 2 2 ...
## $ Pclass : Factor w/ 3 levels "1","2","3": 2 3 2 1 1 2 3 3 3 3 ...
## $ Sex : Factor w/ 2 levels "female","male": 2 2 2 1 1 2 2 1 2 2 ...
## $ Age : num 36 25 62 15 21 57 19 35 31 38 ...
## - attr(*, "na.action")=Class 'omit' Named int [1:177] 6 18 20 27 29 30 32 33 37 43 ...
## .. ..- attr(*, "names")= chr [1:177] "6" "18" "20" "27" ...
## 'data.frame': 214 obs. of 4 variables:
## $ Survived: Factor w/ 2 levels "1","0": 1 2 1 1 1 2 1 1 1 2 ...
## $ Pclass : Factor w/ 3 levels "1","2","3": 1 1 3 3 1 3 2 2 1 1 ...
## $ Sex : Factor w/ 2 levels "female","male": 1 2 1 1 1 1 1 2 2 2 ...
## $ Age : num 35 54 27 4 58 14 55 34 28 40 ...
## - attr(*, "na.action")=Class 'omit' Named int [1:177] 6 18 20 27 29 30 32 33 37 43 ...
## .. ..- attr(*, "names")= chr [1:177] "6" "18" "20" "27" ...
# Fill in the ___, build a tree model: tree
tree <- rpart::rpart(Survived ~ ., data=train, method="class")
# Draw the decision tree
rattle::fancyRpartPlot(tree)
# Predict the values of the test set: pred
pred <- predict(tree, newdata=test, type="class")
# Construct the confusion matrix: conf
(conf <- table(test$Survived, pred))
## pred
## 1 0
## 1 71 24
## 0 22 97
# Print out the accuracy
sum(diag(conf)) / sum(conf)
## [1] 0.7850467
# Calculation of a complex tree
tree <- rpart::rpart(Survived ~ ., train, method = "class", control = rpart::rpart.control(cp=0.00001))
# Draw the complex tree
rattle::fancyRpartPlot(tree)
# Prune the tree: pruned
pruned <- rpart::prune(tree, cp=0.01)
# Draw pruned
rattle::fancyRpartPlot(pruned)
data(spam, package="kernlab")
spam <- spam %>%
mutate(spam=as.integer(type)-1L) %>%
select(-type)
str(spam)
## 'data.frame': 4601 obs. of 58 variables:
## $ make : num 0 0.21 0.06 0 0 0 0 0 0.15 0.06 ...
## $ address : num 0.64 0.28 0 0 0 0 0 0 0 0.12 ...
## $ all : num 0.64 0.5 0.71 0 0 0 0 0 0.46 0.77 ...
## $ num3d : num 0 0 0 0 0 0 0 0 0 0 ...
## $ our : num 0.32 0.14 1.23 0.63 0.63 1.85 1.92 1.88 0.61 0.19 ...
## $ over : num 0 0.28 0.19 0 0 0 0 0 0 0.32 ...
## $ remove : num 0 0.21 0.19 0.31 0.31 0 0 0 0.3 0.38 ...
## $ internet : num 0 0.07 0.12 0.63 0.63 1.85 0 1.88 0 0 ...
## $ order : num 0 0 0.64 0.31 0.31 0 0 0 0.92 0.06 ...
## $ mail : num 0 0.94 0.25 0.63 0.63 0 0.64 0 0.76 0 ...
## $ receive : num 0 0.21 0.38 0.31 0.31 0 0.96 0 0.76 0 ...
## $ will : num 0.64 0.79 0.45 0.31 0.31 0 1.28 0 0.92 0.64 ...
## $ people : num 0 0.65 0.12 0.31 0.31 0 0 0 0 0.25 ...
## $ report : num 0 0.21 0 0 0 0 0 0 0 0 ...
## $ addresses : num 0 0.14 1.75 0 0 0 0 0 0 0.12 ...
## $ free : num 0.32 0.14 0.06 0.31 0.31 0 0.96 0 0 0 ...
## $ business : num 0 0.07 0.06 0 0 0 0 0 0 0 ...
## $ email : num 1.29 0.28 1.03 0 0 0 0.32 0 0.15 0.12 ...
## $ you : num 1.93 3.47 1.36 3.18 3.18 0 3.85 0 1.23 1.67 ...
## $ credit : num 0 0 0.32 0 0 0 0 0 3.53 0.06 ...
## $ your : num 0.96 1.59 0.51 0.31 0.31 0 0.64 0 2 0.71 ...
## $ font : num 0 0 0 0 0 0 0 0 0 0 ...
## $ num000 : num 0 0.43 1.16 0 0 0 0 0 0 0.19 ...
## $ money : num 0 0.43 0.06 0 0 0 0 0 0.15 0 ...
## $ hp : num 0 0 0 0 0 0 0 0 0 0 ...
## $ hpl : num 0 0 0 0 0 0 0 0 0 0 ...
## $ george : num 0 0 0 0 0 0 0 0 0 0 ...
## $ num650 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ lab : num 0 0 0 0 0 0 0 0 0 0 ...
## $ labs : num 0 0 0 0 0 0 0 0 0 0 ...
## $ telnet : num 0 0 0 0 0 0 0 0 0 0 ...
## $ num857 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ data : num 0 0 0 0 0 0 0 0 0.15 0 ...
## $ num415 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ num85 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ technology : num 0 0 0 0 0 0 0 0 0 0 ...
## $ num1999 : num 0 0.07 0 0 0 0 0 0 0 0 ...
## $ parts : num 0 0 0 0 0 0 0 0 0 0 ...
## $ pm : num 0 0 0 0 0 0 0 0 0 0 ...
## $ direct : num 0 0 0.06 0 0 0 0 0 0 0 ...
## $ cs : num 0 0 0 0 0 0 0 0 0 0 ...
## $ meeting : num 0 0 0 0 0 0 0 0 0 0 ...
## $ original : num 0 0 0.12 0 0 0 0 0 0.3 0 ...
## $ project : num 0 0 0 0 0 0 0 0 0 0.06 ...
## $ re : num 0 0 0.06 0 0 0 0 0 0 0 ...
## $ edu : num 0 0 0.06 0 0 0 0 0 0 0 ...
## $ table : num 0 0 0 0 0 0 0 0 0 0 ...
## $ conference : num 0 0 0 0 0 0 0 0 0 0 ...
## $ charSemicolon : num 0 0 0.01 0 0 0 0 0 0 0.04 ...
## $ charRoundbracket : num 0 0.132 0.143 0.137 0.135 0.223 0.054 0.206 0.271 0.03 ...
## $ charSquarebracket: num 0 0 0 0 0 0 0 0 0 0 ...
## $ charExclamation : num 0.778 0.372 0.276 0.137 0.135 0 0.164 0 0.181 0.244 ...
## $ charDollar : num 0 0.18 0.184 0 0 0 0.054 0 0.203 0.081 ...
## $ charHash : num 0 0.048 0.01 0 0 0 0 0 0.022 0 ...
## $ capitalAve : num 3.76 5.11 9.82 3.54 3.54 ...
## $ capitalLong : num 61 101 485 40 40 15 4 11 445 43 ...
## $ capitalTotal : num 278 1028 2259 191 191 ...
## $ spam : int 1 1 1 1 1 1 1 1 1 1 ...
idxTrain <- sample(x=1:nrow(spam), size=round(.7*nrow(spam)), replace=FALSE)
train <- spam[idxTrain, ]
test <- spam[-idxTrain, ]
dim(train); dim(test)
## [1] 3221 58
## [1] 1380 58
# Train and test tree with gini criterion
tree_g <- rpart::rpart(spam ~ ., train, method = "class")
pred_g <- predict(tree_g, test, type = "class")
conf_g <- table(test$spam, pred_g)
acc_g <- sum(diag(conf_g)) / sum(conf_g)
# Change the first line of code to use information gain as splitting criterion
tree_i <- rpart::rpart(spam ~ ., train, method = "class", parms = list(split = "information"))
pred_i <- predict(tree_i, test, type = "class")
conf_i <- table(test$spam, pred_i)
acc_i <- sum(diag(conf_i)) / sum(conf_i)
# Draw a fancy plot of both tree_g and tree_i
rattle::fancyRpartPlot(tree_g)
rattle::fancyRpartPlot(tree_i)
# Print out acc_g and acc_i
acc_g
## [1] 0.8869565
acc_i
## [1] 0.8971014
# Shuffle the dataset, call the result shuffled
titanic <- titanic_train %>%
select(Survived, Pclass, Sex, Age) %>%
mutate(Survived=factor(Survived, levels=c(1, 0)), Sex=as.integer(factor(Sex))-1L) %>%
na.omit()
n <- nrow(titanic)
shuffled <- titanic[sample(n),]
# Split the data in train and test
train_indices <- 1:round(0.7*n)
train <- shuffled[train_indices, ]
test <- shuffled[-train_indices, ]
# Print the structure of train and test
str(train)
## 'data.frame': 500 obs. of 4 variables:
## $ Survived: Factor w/ 2 levels "1","0": 2 1 1 1 2 2 1 1 1 2 ...
## $ Pclass : int 3 1 1 1 3 3 2 2 1 3 ...
## $ Sex : int 0 1 0 1 1 1 0 1 1 1 ...
## $ Age : num 2 26 43 27 19 36 41 19 25 28 ...
## - attr(*, "na.action")=Class 'omit' Named int [1:177] 6 18 20 27 29 30 32 33 37 43 ...
## .. ..- attr(*, "names")= chr [1:177] "6" "18" "20" "27" ...
str(test)
## 'data.frame': 214 obs. of 4 variables:
## $ Survived: Factor w/ 2 levels "1","0": 1 2 2 2 2 1 1 2 2 2 ...
## $ Pclass : int 2 3 2 3 2 2 1 3 3 2 ...
## $ Sex : int 0 1 1 1 1 0 0 1 1 1 ...
## $ Age : num 28 17 34 28 29 36 19 22 25 30 ...
## - attr(*, "na.action")=Class 'omit' Named int [1:177] 6 18 20 27 29 30 32 33 37 43 ...
## .. ..- attr(*, "names")= chr [1:177] "6" "18" "20" "27" ...
# Store the Survived column of train and test in train_labels and test_labels
train_labels <- train$Survived
test_labels <- test$Survived
# Copy train and test to knn_train and knn_test
knn_train <- train
knn_test <- test
# Drop Survived column for knn_train and knn_test
knn_train$Survived <- NULL
knn_test$Survived <- NULL
# Normalize Pclass
min_class <- min(knn_train$Pclass)
max_class <- max(knn_train$Pclass)
knn_train$Pclass <- (knn_train$Pclass - min_class) / (max_class - min_class)
knn_test$Pclass <- (knn_test$Pclass - min_class) / (max_class - min_class)
# Normalize Age
min_age <- min(knn_train$Age)
max_age <- max(knn_train$Age)
knn_train$Age <- (knn_train$Age - min_age) / (max_age - min_age)
knn_test$Age <- (knn_test$Age - min_age) / (max_age - min_age)
# Fill in the ___, make predictions using knn: pred
pred <- class::knn(train = knn_train, test = knn_test, cl = train_labels, k = 5)
# Construct the confusion matrix: conf
(conf <- table(test_labels, pred))
## pred
## test_labels 1 0
## 1 60 28
## 0 12 114
range <- 1:round(0.2 * nrow(knn_train))
accs <- rep(0, length(range))
for (k in range) {
# Fill in the ___, make predictions using knn: pred
pred <- class::knn(knn_train, knn_test, cl=train_labels, k = k)
# Fill in the ___, construct the confusion matrix: conf
conf <- table(test_labels, pred)
# Fill in the ___, calculate the accuracy and store it in accs[k]
accs[k] <- sum(diag(conf)) / sum(conf)
}
# Plot the accuracies. Title of x-axis is "k".
plot(range, accs, xlab = "k")
# Calculate the best k
which.max(accs)
## [1] 3
# CAUTION - DO NOT HAVE THIS DATA, though UCIMLR (Census + Income) is the SOURCE
# test should be 9215 x 14 while train should be 21503 x 14
# income is the key variable, with 1 meaning > $50,000 while 0 meaning otherwise
# Build a tree on the training set: tree
# tree <- rpart::rpart(income ~ ., train, method = "class")
# Predict probability values using the model: all_probs
# all_probs <- predict(tree, newdata=test, type="prob")
# Print out all_probs
# str(all_probs)
# Select second column of all_probs: probs
# probs <- all_probs[, 2]
# Make a prediction object: pred
# pred <- ROCR::prediction(probs, test$income)
# Make a performance object: perf
# perf <- ROCR::performance(pred, "tpr", "fpr")
# Plot this curve
# plot(perf)
# Make a performance object: perf
# perf <- ROCR::performance(pred, "auc")
# Print out the AUC
# perf@y.values[[1]]
# EVEN MORE DATA THAT I DO NOT HAVE
draw_roc_lines <- function(tree, knn) {
if (!(class(tree)== "performance" && class(knn) == "performance") ||
!(attr(class(tree),"package") == "ROCR" && attr(class(knn),"package") == "ROCR")) {
stop("This predefined function needs two performance objects as arguments.")
} else if (length(tree@x.values) == 0 | length(knn@x.values) == 0) {
stop('This predefined function needs the right kind of performance objects as arguments. Are you sure you are creating both objects with arguments "tpr" and "fpr"?')
} else {
plot(0,0,
type = "n",
main = "ROC Curves",
ylab = "True positive rate",
xlab = "False positive rate",
ylim = c(0,1),
xlim = c(0,1))
lines(tree@x.values[[1]], tree@y.values[[1]], type = "l", lwd = 2, col = "red")
lines(knn@x.values[[1]], knn@y.values[[1]], type = "l", lwd = 2, col = "green")
legend("bottomright", c("DT","KNN"), lty=c(1,1),lwd=c(2.5,2.5),col=c("red","green"))
}
}
# Make the prediction objects for both models: pred_t, pred_k
# pred_t <- ROCR::prediction(probs_t, test$spam)
# pred_k <- ROCR::prediction(probs_k, test$spam)
# Make the performance objects for both models: perf_t, perf_k
# perf_t <- ROCR::performance(pred_t, "tpr", "fpr")
# perf_k <- ROCR::performance(pred_k, "tpr", "fpr")
# Draw the ROC lines using draw_roc_lines()
# draw_roc_lines(perf_t, perf_k)
Chapter 4 - Regression
Simple, Linear Regression - estimated an actual value rather than the class of an observation:
Multivariable Linear Regression - combining several predictors all in a single model:
k-Nearest-Neighbors and Generalization - solution to problem of not knowing what transformations to use:
Example code includes:
kang_nose <- data.frame(nose_width=c( 241, 222, 233, 207, 247, 189, 226, 240, 215, 231, 263, 220, 271, 284, 279, 272, 268, 278, 238, 255, 308, 281, 288, 306, 236, 204, 216, 225, 220, 219, 201, 213, 228, 234, 237, 217, 211, 238, 221, 281, 292, 251, 231, 275, 275 ) ,
nose_length=c( 609, 629, 620, 564, 645, 493, 606, 660, 630, 672, 778, 616, 727, 810, 778, 823, 755, 710, 701, 803, 855, 838, 830, 864, 635, 565, 562, 580, 596, 597, 636, 559, 615, 740, 677, 675, 629, 692, 710, 730, 763, 686, 717, 737, 816 )
)
str(kang_nose)
## 'data.frame': 45 obs. of 2 variables:
## $ nose_width : num 241 222 233 207 247 189 226 240 215 231 ...
## $ nose_length: num 609 629 620 564 645 493 606 660 630 672 ...
nose_width_new <- data.frame(nose_width=250)
# Plot nose length as function of nose width.
plot(kang_nose, xlab = "nose width", ylab = "nose length")
# Fill in the ___, describe the linear relationship between the two variables: lm_kang
lm_kang <- lm(nose_length ~ nose_width, data = kang_nose)
# Print the coefficients of lm_kang
lm_kang$coefficients
## (Intercept) nose_width
## 27.893058 2.701175
# Predict and print the nose length of the escaped kangoroo
predict(lm_kang, newdata=nose_width_new)
## 1
## 703.1869
# Build model and make plot
lm_kang <- lm(nose_length ~ nose_width, data=kang_nose)
plot(kang_nose, xlab = "nose width", ylab = "nose length")
abline(lm_kang$coefficients, col = "red")
# Apply predict() to lm_kang: nose_length_est
nose_length_est <- predict(lm_kang)
# Calculate difference between the predicted and the true values: res
res <- (kang_nose$nose_length - nose_length_est)
# Calculate RMSE, assign it to rmse and print it
(rmse <- sqrt( mean( res^2 ) ))
## [1] 43.26288
# Calculate the residual sum of squares: ss_res
ss_res <- sum(res^2)
# Determine the total sum of squares: ss_tot
ss_tot <- sum( (kang_nose$nose_length - mean(kang_nose$nose_length))^2 )
# Calculate R-squared and assign it to r_sq. Also print it.
(r_sq <- 1 - ss_res / ss_tot)
## [1] 0.7768914
# Apply summary() to lm_kang
summary(lm_kang)
##
## Call:
## lm(formula = nose_length ~ nose_width, data = kang_nose)
##
## Residuals:
## Min 1Q Median 3Q Max
## -69.876 -32.912 -4.855 30.227 86.307
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 27.8931 54.2991 0.514 0.61
## nose_width 2.7012 0.2207 12.236 1.34e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 44.26 on 43 degrees of freedom
## Multiple R-squared: 0.7769, Adjusted R-squared: 0.7717
## F-statistic: 149.7 on 1 and 43 DF, p-value: 1.342e-15
cgdp <- c( 666.3, 5935.7, 4619.2, 7574.3, 3646.7, 13961.2, 51127.1, 7884.2, 295.1, 47516.5, 825.2, 720, 1096.6, 7712.8, 22245.5, 4796.2, 8040, 11612.5, 15199.3, 40776.3, 7757, 378.6, 7593.9, 1426.4, 7720, 860.8, 3715.3, 10035.4, 27194.4, 47627.4, 7433.9, 60634.4, 6222.5, 6850.3, 39567.9, 590.2, 30262.2, 567.8, 36317.8, 1555, 49541.3, 4543.3, 1461.6, 550, 422.8, 585.6, 21682.6, 8299.1, 3703, 37896.8, 2346.7, 13507.4, 13902.7, 3514.6, 53313.6, 6432.8, 52111, 34960.3, 36194.4, 1269.1, 1084.4, 1604.4, 15209.9, 27970.5, 9127.3, 1707.5, 10139.2, 6575.4, 7437, 10125.6, 944.4, 648.1, 3631, 2032.8, 4301.1, 995.5, 16037.8, 3140, 2233.8, 449.4, 8624.8, 8518.7, 10361.3, 4731.6, 5370.7, 765.7, 1197.5, 4333.3, 7370.9, 4170.2, 1270.2, 10005.6, 253, 54198.7, 440.7, 3184.6, 1913.6, 97363.1, 698.3, 38400.1, 4749, 1333.5, 11770.9, 6594.4, 2843.1, 11879.7, 14422.8, 22080.9, 4479.1, 3575.2, 93397.1, 9996.7, 12735.9, 652.1, 1541.1, 25409, 1904.2, 1070.9, 2021.7, 3950.7, 6152.9, 1781.1, 1113.4, 1692.4, 18416.5, 23962.6, 58887.3, 2682.3, 15359.2, 1053.8, 646.1, 9031.5, 1280.4, 4106.4, 998.1, 677.4, 3082.5, 7986.9, 16810.9, 6477.9, 475.2, 1801.9 )
urb_pop <- c( 26.3, 43.3, 56.4, 57.6, 62.8, 24.2, 65.9, 54.4, 11.8, 97.8, 43.5, 29, 33.5, 73.6, 82.8, 39.6, 76.3, 85.4, 31.6, 76.9, 57.2, 39.8, 54.4, 53.8, 76.2, 28.2, 64.8, 75.9, 67, 75.1, 69.3, 87.5, 51.9, 59.9, 75.7, 22.2, 79.4, 19, 74.6, 40.7, 84.1, 53.4, 53.4, 36.7, 59, 48.5, 77.7, 35.6, 51.1, 80.6, 54.1, 58.7, 70.8, 53, 63, 69.4, 94, 68.8, 93, 35.6, 20.5, 44.2, 32, 82.4, 77.7, 37.6, 87.7, 78.4, 18.5, 79.5, 30.9, 29.6, 18.3, 38.6, 47, 26.8, 67.4, 59.7, 44.9, 34.5, 44.5, 64.1, 79, 49.1, 57, 39.1, 33.6, 60.4, 63.8, 71.2, 59.3, 39.8, 16.1, 81.5, 18.5, 46.9, 58.5, 80.2, 18.2, 80, 48.6, 38.3, 66.3, 78.3, 44.5, 86.5, 60.6, 62.9, 59.4, 37.2, 99.2, 54.4, 73.9, 27.8, 32.6, 82.9, 33.6, 43.4, 21.9, 66.3, 55.5, 37.2, 18.6, 64.5, 53.8, 49.7, 85.7, 21.3, 53.6, 22.3, 39.5, 49.7, 32.1, 23.6, 30.9, 15.8, 69.5, 61.8, 95.2, 64.3, 42, 40.5 )
world_bank_train <- data.frame(urb_pop=urb_pop, cgdp=cgdp)
str(world_bank_train)
## 'data.frame': 142 obs. of 2 variables:
## $ urb_pop: num 26.3 43.3 56.4 57.6 62.8 24.2 65.9 54.4 11.8 97.8 ...
## $ cgdp : num 666 5936 4619 7574 3647 ...
cgdp_afg <- data.frame(cgdp=413)
# Plot urb_pop as function of cgdp
with(world_bank_train, plot(y=urb_pop, x=cgdp))
# Set up a linear model between the two variables: lm_wb
lm_wb <- lm(urb_pop ~ cgdp, data=world_bank_train)
# Add a red regression line to your scatter plot
abline(lm_wb$coefficients, col="red")
# Summarize lm_wb and select R-squared
summary(lm_wb)$r.squared
## [1] 0.3822347
# Predict the urban population of afghanistan based on cgdp_afg
predict(lm_wb, newdata=cgdp_afg)
## 1
## 45.0156
# Plot: change the formula and xlab
plot(urb_pop ~ log(cgdp), data = world_bank_train,
xlab = "log(GDP per Capita)",
ylab = "Percentage of urban population")
# Linear model: change the formula
lm_wb <- lm(urb_pop ~ log(cgdp), data = world_bank_train)
# Add a red regression line to your scatter plot
abline(lm_wb$coefficients, col = "red")
# Summarize lm_wb and select R-squared
summary(lm_wb)$r.squared
## [1] 0.5788284
# Predict the urban population of afghanistan based on cgdp_afg
predict(lm_wb, newdata=cgdp_afg)
## 1
## 25.86829
sales <- c( 231, 156, 10, 519, 437, 487, 299, 195, 20, 68, 570, 428, 464, 15, 65, 98, 398, 161, 397, 497, 528, 99, 0.5, 347, 341, 507, 400 )
sq_ft <- c( 3, 2.2, 0.5, 5.5, 4.4, 4.8, 3.1, 2.5, 1.2, 0.6, 5.4, 4.2, 4.7, 0.6, 1.2, 1.6, 4.3, 2.6, 3.8, 5.3, 5.6, 0.8, 1.1, 3.6, 3.5, 5.1, 8.6 )
inv <- c( 294, 232, 149, 600, 567, 571, 512, 347, 212, 102, 788, 577, 535, 163, 168, 151, 342, 196, 453, 518, 615, 278, 142, 461, 382, 590, 517 )
ads <- c( 8.2, 6.9, 3, 12, 10.6, 11.8, 8.1, 7.7, 3.3, 4.9, 17.4, 10.5, 11.3, 2.5, 4.7, 4.6, 5.5, 7.2, 10.4, 11.5, 12.3, 2.8, 3.1, 9.6, 9.8, 12, 7 )
size_dist <- c( 8.2, 4.1, 4.3, 16.1, 14.1, 12.7, 10.1, 8.4, 2.1, 4.7, 12.3, 14, 15, 2.5, 3.3, 2.7, 16, 6.3, 13.9, 16.3, 16, 6.5, 1.6, 11.3, 11.5, 15.7, 12 )
comp <- c( 11, 12, 15, 1, 5, 4, 10, 12, 15, 8, 1, 7, 3, 14, 11, 10, 4, 13, 7, 1, 0, 14, 12, 6, 5, 0, 8 )
shop_data <- data.frame(sales=sales, sq_ft=sq_ft, inv=inv, ads=ads,
size_dist=size_dist, comp=comp
)
str(shop_data)
## 'data.frame': 27 obs. of 6 variables:
## $ sales : num 231 156 10 519 437 487 299 195 20 68 ...
## $ sq_ft : num 3 2.2 0.5 5.5 4.4 4.8 3.1 2.5 1.2 0.6 ...
## $ inv : num 294 232 149 600 567 571 512 347 212 102 ...
## $ ads : num 8.2 6.9 3 12 10.6 11.8 8.1 7.7 3.3 4.9 ...
## $ size_dist: num 8.2 4.1 4.3 16.1 14.1 12.7 10.1 8.4 2.1 4.7 ...
## $ comp : num 11 12 15 1 5 4 10 12 15 8 ...
shop_new <- data.frame(sq_ft=2.3, inv=420, ads=8.7, size_dist=9.1, comp=10)
# Add a plot: sales as a function of inventory. Is linearity plausible?
plot(sales ~ sq_ft, shop_data)
plot(sales ~ size_dist, shop_data)
plot(sales ~ inv, shop_data)
# Build a linear model for net sales based on all other variables: lm_shop
lm_shop <- lm(sales ~ ., data=shop_data)
# Summarize lm_shop
summary(lm_shop)
##
## Call:
## lm(formula = sales ~ ., data = shop_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -26.338 -9.699 -4.496 4.040 41.139
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -18.85941 30.15023 -0.626 0.538372
## sq_ft 16.20157 3.54444 4.571 0.000166 ***
## inv 0.17464 0.05761 3.032 0.006347 **
## ads 11.52627 2.53210 4.552 0.000174 ***
## size_dist 13.58031 1.77046 7.671 1.61e-07 ***
## comp -5.31097 1.70543 -3.114 0.005249 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 17.65 on 21 degrees of freedom
## Multiple R-squared: 0.9932, Adjusted R-squared: 0.9916
## F-statistic: 611.6 on 5 and 21 DF, p-value: < 2.2e-16
# Plot the residuals in function of your fitted observations
plot(x=lm_shop$fitted.values, y=lm_shop$residuals)
# Make a Q-Q plot of your residual quantiles
qqnorm(lm_shop$residuals, ylab="Residual Quantiles")
# Summarize your model, are there any irrelevant predictors?
summary(lm_shop)
##
## Call:
## lm(formula = sales ~ ., data = shop_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -26.338 -9.699 -4.496 4.040 41.139
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -18.85941 30.15023 -0.626 0.538372
## sq_ft 16.20157 3.54444 4.571 0.000166 ***
## inv 0.17464 0.05761 3.032 0.006347 **
## ads 11.52627 2.53210 4.552 0.000174 ***
## size_dist 13.58031 1.77046 7.671 1.61e-07 ***
## comp -5.31097 1.70543 -3.114 0.005249 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 17.65 on 21 degrees of freedom
## Multiple R-squared: 0.9932, Adjusted R-squared: 0.9916
## F-statistic: 611.6 on 5 and 21 DF, p-value: < 2.2e-16
# Predict the net sales based on shop_new.
predict(lm_shop, newdata=shop_new)
## 1
## 262.5006
choco_data <- data.frame(
energy=c( 1970, 2003, 2057, 1920, 2250, 2186, 1930, 1980, 1890, 2030, 2180, 1623, 1640, 2210, 1980, 1970, 1877.4, 2021.4, 1840.1, 2272.1, 2047.3, 1843, 2075.2, 2119.8, 2090.9, 1934.3, 2257.3, 2057.9, 1878.2, 1595.3, 2188.3, 1980.4, 1985.9, 2156.5, 2134.6, 2094.2, 2151.7, 2127.7, 2001.9, 1635.2, 2098.9, 1978.6, 1961.2, 1727.2, 1903.7, 2062.6, 2230.1, 1970.5, 2057.4, 1979.2, 1744.1, 1914.9, 1918.7, 1978.1, 2184, 2124.4 ),
protein=c( 3.1, 4.6, 9.9, 5.1, 10.2, 7, 3.5, 7.2, 4.7, 5.6, 5.5, 2.2, 3.7, 8.2, 8.5, 5, 6.1, 4.6, 3.4, 10.5, 5.9, 3.2, 5.6, 7.5, 7.3, 5.4, 8.9, 6, 2.8, 3.4, 5.5, 7, 7.7, 8.9, 9.4, 7.5, 10.4, 5.6, 9.1, 2.9, 9.1, 4.7, 2.2, 2.3, 6.3, 6.7, 8.3, 6.3, 5.3, 7.8, 5.8, 7, 4.3, 6.9, 8.9, 5 ),
fat=c( 27.2, 26.5, 23, 18.4, 30.1, 28.4, 24.5, 22.9, 19.5, 20.4, 26.8, 9.2, 12, 29.8, 20.6, 20, 18, 22.3, 20.8, 27.7, 25.7, 18.3, 27.6, 25.8, 26.9, 21.6, 29.4, 27.8, 21.4, 12.9, 32.1, 24.4, 19.6, 26.6, 24.5, 24.6, 27.2, 26.1, 21.8, 12.2, 25, 26.7, 22, 16.5, 21.5, 29.6, 28.1, 20.8, 28.1, 21.2, 15.4, 19.9, 18.9, 21.9, 30.5, 25.1 ),
size=c( 50, 50, 40, 80, 45, 78, 55, 60, 60, 50, 40, 55, 44.5, 75, 60, 42.5, 52.3, 52.3, 63.1, 64.8, 46.9, 45, 60.7, 66.3, 54.7, 66.2, 62.6, 48, 58.8, 37.5, 75.4, 80.8, 50.6, 43.3, 63.9, 54.4, 87.6, 55.9, 64.3, 52.8, 46.7, 57.7, 31.8, 72, 56.6, 83.9, 63.4, 46, 63.7, 43.2, 37.2, 58.5, 49, 55.2, 57.9, 48.8 )
)
str(choco_data)
## 'data.frame': 56 obs. of 4 variables:
## $ energy : num 1970 2003 2057 1920 2250 ...
## $ protein: num 3.1 4.6 9.9 5.1 10.2 7 3.5 7.2 4.7 5.6 ...
## $ fat : num 27.2 26.5 23 18.4 30.1 28.4 24.5 22.9 19.5 20.4 ...
## $ size : num 50 50 40 80 45 78 55 60 60 50 ...
# Add a plot: energy/100g as function of total size. Linearity plausible?
plot(energy ~ protein, choco_data)
plot(energy ~ fat, choco_data)
plot(energy ~ size, choco_data)
# Build a linear model for the energy based on all other variables: lm_choco
lm_choco <- lm(energy ~ ., data=choco_data)
# Plot the residuals in function of your fitted observations
plot(x=lm_choco$fitted.values, y=lm_choco$residuals)
# Make a Q-Q plot of your residual quantiles
qqnorm(lm_choco$residuals)
# Summarize lm_choco
summary(lm_choco)
##
## Call:
## lm(formula = energy ~ ., data = choco_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -106.680 -36.071 -9.062 36.079 104.361
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1339.2806 40.0195 33.466 < 2e-16 ***
## protein 23.0122 3.6565 6.293 6.6e-08 ***
## fat 24.4416 1.6839 14.515 < 2e-16 ***
## size -0.8224 0.6026 -1.365 0.178
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 52.14 on 52 degrees of freedom
## Multiple R-squared: 0.9021, Adjusted R-squared: 0.8965
## F-statistic: 159.8 on 3 and 52 DF, p-value: < 2.2e-16
world_bank_test <- data.frame(
cgdp=c( 18389.4, 1099, 2379.2, 5823.3, 3670, 788.4, 1646.4, 19553.9, 1630.8, 61887, 2965.9, 3436.3, 12276.4, 3150.5, 42736.2, 16529.7, 10067.5, 25592.4, 50271.1, 5422.6, 6290.8, 20832, 10803.5, 935.9, 37031.7, 5292.9, 45603.3, 42522, 56286.8, 14520, 5361.1, 6662.6, 4017, 2037.7, 6075.5, 1784.4, 96443.7, 40169.6, 19719.8, 1796, 619, 10829.9, 16444.8, 14091.4, 54629.5, 5560.7, 43619.1, 19199.3, 832.9, 9463.1, 25198.1, 461, 5719.6, 3100.8, 10542.8, 12922.4, 1337.9, 51590, 914.7, 2052.3, 4173.4 ),
urb_pop=c( 39.8, 26.7, 37.9, 46.2, 53.5, 39.6, 53.5, 73, 32.4, 89.3, 75, 43.1, 53.3, 68.1, 79.3, 88.9, 86.9, 70.7, 81.7, 83.4, 63.5, 77.2, 53.5, 32.5, 92.1, 72.9, 82.3, 85.3, 100, 89.4, 70.1, 50.2, 28.5, 36.3, 78.1, 77.3, 100, 100, 67.6, 37.2, 31.9, 74, 66.5, 62.3, 81.4, 49.2, 80.7, 80.5, 57.4, 55.7, 88.7, 49.3, 45.7, 65, 72.9, 91.6, 25.2, 89.9, 34, 33, 19.3 )
)
str(world_bank_test)
## 'data.frame': 61 obs. of 2 variables:
## $ cgdp : num 18389 1099 2379 5823 3670 ...
## $ urb_pop: num 39.8 26.7 37.9 46.2 53.5 39.6 53.5 73 32.4 89.3 ...
# Build the log-linear model
lm_wb_log <- lm(urb_pop ~ log(cgdp), data = world_bank_train)
# Calculate rmse_train
rmse_train <- sqrt(mean(lm_wb_log$residuals ^ 2))
# The real percentage of urban population in the test set, the ground truth
world_bank_test_truth <- world_bank_test$urb_pop
# The predictions of the percentage of urban population in the test set
world_bank_test_input <- data.frame(cgdp = world_bank_test$cgdp)
world_bank_test_output <- predict(lm_wb_log, world_bank_test_input)
# The residuals: the difference between the ground truth and the predictions
res_test <- world_bank_test_output - world_bank_test_truth
# Use res_test to calculate rmse_test
rmse_test <- sqrt(mean(res_test^2))
# Print the ratio of the test RMSE over the training RMSE
rmse_test / rmse_train
## [1] 1.082428
my_knn <- function(x_pred, x, y, k){
m <- length(x_pred)
predict_knn <- rep(0, m)
for (i in 1:m) {
# Calculate the absolute distance between x_pred[i] and x
dist <- abs(x_pred[i] - x)
# Apply order() to dist, sort_index will contain
# the indices of elements in the dist vector, in
# ascending order. This means sort_index[1:k] will
# return the indices of the k-nearest neighbors.
sort_index <- order(dist)
# Apply mean() to the responses of the k-nearest neighbors
predict_knn[i] <- mean(y[sort_index[1:k]])
}
return(predict_knn)
}
# Apply your algorithm on the test set: test_output
test_output <- my_knn(x_pred=world_bank_test$cgdp, x=world_bank_train$cgdp,
y=world_bank_train$urb_pop, k=30
)
# Have a look at the plot of the output
plot(world_bank_train[,2:1],
xlab = "GDP per Capita",
ylab = "Percentage Urban Population")
points(world_bank_test$cgdp, test_output, col = "green")
# Set up a linear model between the two variables: lm_wb
lm_wb <- lm(urb_pop ~ cgdp, data=world_bank_train)
# Set up a linear model between the two variables: lm_wb
lm_wb_log <- lm(urb_pop ~ log(cgdp), data=world_bank_train)
# Define ranks to order the predictor variables in the test set
ranks <- order(world_bank_test$cgdp)
# Scatter plot of test set
plot(world_bank_test,
xlab = "GDP per Capita", ylab = "Percentage Urban Population")
# Predict with simple linear model and add line
test_output_lm <- predict(lm_wb, data.frame(cgdp = world_bank_test$cgdp))
lines(world_bank_test$cgdp[ranks], test_output_lm[ranks], lwd = 2, col = "blue")
# Predict with log-linear model and add line
test_output_lm_log <- predict(lm_wb_log, data.frame(cgdp = world_bank_test$cgdp))
lines(world_bank_test$cgdp[ranks], test_output_lm_log[ranks], lwd = 2, col = "red")
# Predict with k-NN and add line
test_output_knn <- my_knn(x_pred=world_bank_test$cgdp, x=world_bank_train$cgdp,
y=world_bank_train$urb_pop, k=30
)
lines(world_bank_test$cgdp[ranks], test_output_knn[ranks], lwd = 2, col = "green")
# Calculate RMSE on the test set for simple linear model
sqrt(mean( (test_output_lm - world_bank_test$urb_pop) ^ 2))
## [1] 17.41258
# Calculate RMSE on the test set for log-linear model
sqrt(mean( (test_output_lm_log - world_bank_test$urb_pop) ^ 2))
## [1] 15.01008
# Calculate RMSE on the test set for k-NN technique
sqrt(mean( (test_output_knn - world_bank_test$urb_pop) ^ 2))
## [1] 16.0917
Chapter 5 - Clustering
Clustering with k-means (unsupervised learning) - objects that are similar within and dissimilar across:
Performance and scaling issues - since there is no “truth”, the goal is to have compact clusters with low variance within the clusters and high separation between the clusters:
Hierarchical Clustering - addressing questions such as “which objects cluster first” and “which cluster pairs merge, and when”:
Example code includes:
seeds <- data.frame(area=c( 15.26, 14.88, 14.29, 13.84, 16.14, 14.38, 14.69, 14.11, 16.63, 16.44, 15.26, 14.03, 13.89, 13.78, 13.74, 14.59, 13.99, 15.69, 14.7, 12.72, 14.16, 14.11, 15.88, 12.08, 15.01, 16.19, 13.02, 12.74, 14.11, 13.45, 13.16, 15.49, 14.09, 13.94, 15.05, 16.12, 16.2, 17.08, 14.8, 14.28, 13.54, 13.5, 13.16, 15.5, 15.11, 13.8, 15.36, 14.99, 14.79, 14.86, 14.43, 15.78, 14.49, 14.33, 14.52, 15.03, 14.46, 14.92, 15.38, 12.11, 11.42, 11.23, 12.36, 13.22, 12.78, 12.88, 14.34, 14.01, 14.37, 12.73, 17.63, 16.84, 17.26, 19.11, 16.82, 16.77, 17.32, 20.71, 18.94, 17.12, 16.53, 18.72, 20.2, 19.57, 19.51, 18.27, 18.88, 18.98, 21.18, 20.88, 20.1, 18.76, 18.81, 18.59, 18.36, 16.87, 19.31, 18.98, 18.17, 18.72, 16.41, 17.99, 19.46, 19.18, 18.95, 18.83, 18.85, 17.63, 19.94, 18.55, 18.45, 19.38, 19.13, 19.14, 20.97, 19.06, 18.96, 19.15, 18.89, 20.03, 20.24, 18.14, 16.17, 18.43, 15.99, 18.75, 18.65, 17.98, 20.16, 17.55, 18.3, 18.94, 15.38, 16.16, 15.56, 15.38, 17.36, 15.57, 15.6, 16.23, 13.07, 13.32, 13.34, 12.22, 11.82, 11.21, 11.43, 12.49, 12.7, 10.79, 11.83, 12.01, 12.26, 11.18, 11.36, 11.19, 11.34, 12.13, 11.75, 11.49, 12.54, 12.02, 12.05, 12.55, 11.14, 12.1, 12.44, 12.15, 11.35, 11.24, 11.02, 11.55, 11.27, 11.4, 10.83, 10.8, 11.26, 10.74, 11.48, 12.21, 11.41, 12.46, 12.19, 11.65, 12.89, 11.56, 11.81, 10.91, 11.23, 10.59, 10.93, 11.27, 11.87, 10.82, 12.11, 12.8, 12.79, 13.37, 12.62, 12.76, 12.38, 12.67, 11.18, 12.7, 12.37, 12.19, 11.23, 13.2, 11.84, 12.3 ))
seeds$perimeter <- c( 14.84, 14.57, 14.09, 13.94, 14.99, 14.21, 14.49, 14.1, 15.46, 15.25, 14.85, 14.16, 14.02, 14.06, 14.05, 14.28, 13.83, 14.75, 14.21, 13.57, 14.4, 14.26, 14.9, 13.23, 14.76, 15.16, 13.76, 13.67, 14.18, 14.02, 13.82, 14.94, 14.41, 14.17, 14.68, 15, 15.27, 15.38, 14.52, 14.17, 13.85, 13.85, 13.55, 14.86, 14.54, 14.04, 14.76, 14.56, 14.52, 14.67, 14.4, 14.91, 14.61, 14.28, 14.6, 14.77, 14.35, 14.43, 14.77, 13.47, 12.86, 12.63, 13.19, 13.84, 13.57, 13.5, 14.37, 14.29, 14.39, 13.75, 15.98, 15.67, 15.73, 16.26, 15.51, 15.62, 15.91, 17.23, 16.49, 15.55, 15.34, 16.19, 16.89, 16.74, 16.71, 16.09, 16.26, 16.66, 17.21, 17.05, 16.99, 16.2, 16.29, 16.05, 16.52, 15.65, 16.59, 16.57, 16.26, 16.34, 15.25, 15.86, 16.5, 16.63, 16.42, 16.29, 16.17, 15.86, 16.92, 16.22, 16.12, 16.72, 16.31, 16.61, 17.25, 16.45, 16.2, 16.45, 16.23, 16.9, 16.91, 16.12, 15.38, 15.97, 14.89, 16.18, 16.41, 15.85, 17.03, 15.66, 15.89, 16.32, 14.9, 15.33, 14.89, 14.66, 15.76, 15.15, 15.11, 15.18, 13.92, 13.94, 13.95, 13.32, 13.4, 13.13, 13.13, 13.46, 13.71, 12.93, 13.23, 13.52, 13.6, 13.04, 13.05, 13.05, 12.87, 13.73, 13.52, 13.22, 13.67, 13.33, 13.41, 13.57, 12.79, 13.15, 13.59, 13.45, 13.12, 13, 13, 13.1, 12.97, 13.08, 12.96, 12.57, 13.01, 12.73, 13.05, 13.47, 12.95, 13.41, 13.36, 13.07, 13.77, 13.31, 13.45, 12.8, 12.82, 12.41, 12.8, 12.86, 13.02, 12.83, 13.27, 13.47, 13.53, 13.78, 13.67, 13.38, 13.44, 13.32, 12.72, 13.41, 13.47, 13.2, 12.88, 13.66, 13.21, 13.34 )
seeds$compactness <- c( 0.87, 0.88, 0.9, 0.9, 0.9, 0.9, 0.88, 0.89, 0.87, 0.89, 0.87, 0.88, 0.89, 0.88, 0.87, 0.9, 0.92, 0.91, 0.92, 0.87, 0.86, 0.87, 0.9, 0.87, 0.87, 0.88, 0.86, 0.86, 0.88, 0.86, 0.87, 0.87, 0.85, 0.87, 0.88, 0.9, 0.87, 0.91, 0.88, 0.89, 0.89, 0.89, 0.9, 0.88, 0.9, 0.88, 0.89, 0.89, 0.88, 0.87, 0.88, 0.89, 0.85, 0.88, 0.86, 0.87, 0.88, 0.9, 0.89, 0.84, 0.87, 0.88, 0.89, 0.87, 0.87, 0.89, 0.87, 0.86, 0.87, 0.85, 0.87, 0.86, 0.88, 0.91, 0.88, 0.86, 0.86, 0.88, 0.88, 0.89, 0.88, 0.9, 0.89, 0.88, 0.88, 0.89, 0.9, 0.86, 0.9, 0.9, 0.87, 0.9, 0.89, 0.91, 0.85, 0.86, 0.88, 0.87, 0.86, 0.88, 0.89, 0.9, 0.9, 0.87, 0.88, 0.89, 0.91, 0.88, 0.88, 0.89, 0.89, 0.87, 0.9, 0.87, 0.89, 0.89, 0.91, 0.89, 0.9, 0.88, 0.89, 0.88, 0.86, 0.91, 0.91, 0.9, 0.87, 0.9, 0.87, 0.9, 0.91, 0.89, 0.87, 0.86, 0.88, 0.9, 0.88, 0.85, 0.86, 0.88, 0.85, 0.86, 0.86, 0.87, 0.83, 0.82, 0.83, 0.87, 0.85, 0.81, 0.85, 0.82, 0.83, 0.83, 0.84, 0.83, 0.86, 0.81, 0.81, 0.83, 0.84, 0.85, 0.84, 0.86, 0.86, 0.88, 0.85, 0.84, 0.83, 0.84, 0.82, 0.85, 0.84, 0.84, 0.81, 0.86, 0.84, 0.83, 0.85, 0.85, 0.86, 0.87, 0.86, 0.86, 0.85, 0.82, 0.82, 0.84, 0.86, 0.86, 0.84, 0.86, 0.88, 0.83, 0.86, 0.89, 0.88, 0.88, 0.85, 0.9, 0.86, 0.9, 0.87, 0.89, 0.86, 0.88, 0.85, 0.89, 0.85, 0.87 )
seeds$length <- c( 5.76, 5.55, 5.29, 5.32, 5.66, 5.39, 5.56, 5.42, 6.05, 5.88, 5.71, 5.44, 5.44, 5.48, 5.48, 5.35, 5.12, 5.53, 5.21, 5.23, 5.66, 5.52, 5.62, 5.1, 5.79, 5.83, 5.39, 5.39, 5.54, 5.52, 5.45, 5.76, 5.72, 5.58, 5.71, 5.71, 5.83, 5.83, 5.66, 5.4, 5.35, 5.35, 5.14, 5.88, 5.58, 5.38, 5.7, 5.57, 5.54, 5.68, 5.58, 5.67, 5.71, 5.5, 5.74, 5.7, 5.39, 5.38, 5.66, 5.16, 5.01, 4.9, 5.08, 5.39, 5.26, 5.14, 5.63, 5.61, 5.57, 5.41, 6.19, 6, 5.98, 6.15, 6.02, 5.93, 6.06, 6.58, 6.45, 5.85, 5.88, 6.01, 6.29, 6.38, 6.37, 6.17, 6.08, 6.55, 6.57, 6.45, 6.58, 6.17, 6.27, 6.04, 6.67, 6.14, 6.34, 6.45, 6.27, 6.22, 5.72, 5.89, 6.11, 6.37, 6.25, 6.04, 6.15, 6.03, 6.67, 6.15, 6.11, 6.3, 6.18, 6.26, 6.56, 6.42, 6.05, 6.25, 6.23, 6.49, 6.32, 6.06, 5.76, 5.98, 5.36, 6.11, 6.29, 5.98, 6.51, 5.79, 5.98, 6.14, 5.88, 5.84, 5.78, 5.48, 6.14, 5.92, 5.83, 5.87, 5.47, 5.54, 5.39, 5.22, 5.31, 5.28, 5.18, 5.27, 5.39, 5.32, 5.26, 5.41, 5.41, 5.22, 5.17, 5.25, 5.05, 5.39, 5.44, 5.3, 5.45, 5.35, 5.27, 5.33, 5.01, 5.11, 5.32, 5.42, 5.18, 5.09, 5.33, 5.17, 5.09, 5.14, 5.28, 4.98, 5.19, 5.14, 5.18, 5.36, 5.09, 5.24, 5.24, 5.11, 5.5, 5.36, 5.41, 5.09, 5.09, 4.9, 5.05, 5.09, 5.13, 5.18, 5.24, 5.16, 5.22, 5.32, 5.41, 5.07, 5.22, 4.98, 5.01, 5.18, 5.2, 5.14, 5.14, 5.24, 5.17, 5.24 )
seeds$width <- c( 3.31, 3.33, 3.34, 3.38, 3.56, 3.31, 3.26, 3.3, 3.46, 3.5, 3.24, 3.2, 3.2, 3.16, 3.11, 3.33, 3.38, 3.51, 3.47, 3.05, 3.13, 3.17, 3.51, 2.94, 3.25, 3.42, 3.03, 2.96, 3.22, 3.06, 2.98, 3.37, 3.19, 3.15, 3.33, 3.48, 3.46, 3.68, 3.29, 3.3, 3.16, 3.16, 3.2, 3.4, 3.46, 3.15, 3.39, 3.38, 3.29, 3.26, 3.27, 3.43, 3.11, 3.2, 3.11, 3.21, 3.38, 3.41, 3.42, 3.03, 2.85, 2.88, 3.04, 3.07, 3.03, 3.12, 3.19, 3.16, 3.15, 2.88, 3.56, 3.48, 3.59, 3.93, 3.49, 3.44, 3.4, 3.81, 3.64, 3.57, 3.47, 3.86, 3.86, 3.77, 3.8, 3.65, 3.76, 3.67, 4.03, 4.03, 3.79, 3.8, 3.69, 3.86, 3.48, 3.46, 3.81, 3.55, 3.51, 3.68, 3.52, 3.69, 3.89, 3.68, 3.75, 3.79, 3.81, 3.57, 3.76, 3.67, 3.77, 3.79, 3.9, 3.74, 3.99, 3.72, 3.9, 3.82, 3.77, 3.86, 3.96, 3.56, 3.39, 3.77, 3.58, 3.87, 3.59, 3.69, 3.77, 3.69, 3.75, 3.83, 3.27, 3.4, 3.41, 3.46, 3.57, 3.23, 3.29, 3.47, 2.99, 3.07, 3.07, 2.97, 2.78, 2.69, 2.72, 2.97, 2.91, 2.65, 2.84, 2.78, 2.83, 2.69, 2.75, 2.67, 2.85, 2.75, 2.68, 2.69, 2.88, 2.81, 2.85, 2.97, 2.79, 2.94, 2.9, 2.84, 2.67, 2.71, 2.7, 2.85, 2.76, 2.76, 2.64, 2.82, 2.71, 2.64, 2.76, 2.89, 2.77, 3.02, 2.91, 2.85, 3.03, 2.68, 2.72, 2.67, 2.82, 2.79, 2.72, 2.8, 2.95, 2.63, 2.98, 3.13, 3.05, 3.13, 2.91, 3.15, 2.99, 3.13, 2.81, 3.09, 2.96, 2.98, 2.8, 3.23, 2.84, 2.97 )
seeds$asymmetry <- c( 2.22, 1.02, 2.7, 2.26, 1.36, 2.46, 3.59, 2.7, 2.04, 1.97, 4.54, 1.72, 3.99, 3.14, 2.93, 4.18, 5.23, 1.6, 1.77, 4.1, 3.07, 2.69, 0.77, 1.42, 1.79, 0.9, 3.37, 2.5, 2.75, 3.53, 0.86, 3.41, 3.92, 2.12, 2.13, 2.27, 2.82, 2.96, 3.11, 6.68, 2.59, 2.25, 2.46, 4.71, 3.13, 1.56, 1.37, 2.96, 2.7, 2.13, 3.98, 5.59, 4.12, 3.33, 1.48, 1.93, 2.8, 1.14, 2, 1.5, 2.7, 2.27, 3.22, 4.16, 1.18, 2.35, 1.31, 2.22, 1.46, 3.53, 4.08, 4.67, 4.54, 2.94, 4, 4.92, 3.82, 4.45, 5.06, 2.86, 5.53, 5.32, 5.17, 1.47, 2.96, 2.44, 1.65, 3.69, 5.78, 5.02, 1.96, 3.12, 3.24, 6, 4.93, 3.7, 3.48, 2.14, 2.85, 2.19, 4.22, 2.07, 4.31, 3.36, 3.37, 2.55, 2.84, 3.75, 3.25, 1.74, 2.23, 3.68, 2.11, 6.68, 4.68, 2.25, 4.33, 3.08, 3.64, 3.06, 5.9, 3.62, 4.29, 2.98, 3.34, 4.19, 4.39, 2.26, 1.91, 5.37, 2.84, 2.91, 4.46, 4.27, 4.97, 3.6, 3.53, 2.64, 2.73, 3.77, 5.3, 7.04, 6, 5.47, 4.47, 6.17, 2.22, 4.42, 3.26, 5.46, 5.2, 6.99, 4.76, 3.33, 4.05, 5.81, 3.35, 4.83, 4.38, 5.39, 3.08, 4.27, 4.99, 4.42, 6.39, 2.2, 4.92, 3.64, 4.34, 3.52, 6.74, 6.71, 4.31, 5.59, 5.18, 4.77, 5.34, 4.7, 5.88, 1.66, 4.96, 4.99, 4.86, 5.21, 6.18, 4.06, 4.9, 4.18, 7.52, 4.97, 5.4, 3.98, 3.6, 4.85, 4.13, 4.87, 5.48, 4.67, 3.31, 2.83, 5.47, 2.3, 4.05, 8.46, 3.92, 3.63, 4.33, 8.31, 3.6, 5.64 )
seeds$groove_length <- c( 5.22, 4.96, 4.83, 4.8, 5.17, 4.96, 5.22, 5, 5.88, 5.53, 5.31, 5, 4.74, 4.87, 4.83, 4.78, 4.78, 5.05, 4.65, 4.91, 5.18, 5.22, 5.09, 4.96, 5, 5.31, 4.83, 4.87, 5.04, 5.1, 5.06, 5.23, 5.3, 5.01, 5.36, 5.44, 5.53, 5.48, 5.31, 5, 5.18, 5.18, 4.78, 5.53, 5.18, 4.96, 5.13, 5.17, 5.11, 5.35, 5.14, 5.14, 5.4, 5.22, 5.49, 5.44, 5.04, 5.09, 5.22, 4.52, 4.61, 4.7, 4.61, 5.09, 4.78, 4.61, 5.15, 5.13, 5.3, 5.07, 6.06, 5.88, 5.79, 6.08, 5.84, 5.8, 5.92, 6.45, 6.36, 5.75, 5.88, 5.88, 6.19, 6.27, 6.18, 6.2, 6.11, 6.5, 6.23, 6.32, 6.45, 6.05, 6.05, 5.88, 6.45, 5.97, 6.24, 6.45, 6.27, 6.1, 5.62, 5.84, 6.01, 6.23, 6.15, 5.88, 6.2, 5.93, 6.55, 5.89, 5.79, 5.96, 5.92, 6.05, 6.32, 6.16, 5.75, 6.18, 5.97, 6.32, 6.19, 6.01, 5.7, 5.91, 5.14, 5.99, 6.1, 5.92, 6.18, 5.66, 5.96, 5.95, 5.8, 5.8, 5.85, 5.44, 5.97, 5.88, 5.75, 5.92, 5.39, 5.44, 5.31, 5.22, 5.18, 5.28, 5.13, 5, 5.32, 5.19, 5.31, 5.27, 5.36, 5, 5.26, 5.22, 5, 5.22, 5.31, 5.31, 5.49, 5.31, 5.05, 5.18, 5.05, 5.06, 5.27, 5.34, 5.13, 5.09, 5.16, 4.96, 5, 5.09, 5.18, 5.06, 5.09, 4.96, 5, 5.18, 4.83, 5.15, 5.16, 5.13, 5.32, 5.18, 5.35, 4.96, 4.96, 4.79, 5.04, 5, 5.13, 5.09, 5.01, 4.91, 4.96, 5.09, 5.23, 4.83, 5.04, 4.75, 4.83, 5, 5, 4.87, 5, 5.06, 5.04, 5.06 )
str(seeds)
## 'data.frame': 210 obs. of 7 variables:
## $ area : num 15.3 14.9 14.3 13.8 16.1 ...
## $ perimeter : num 14.8 14.6 14.1 13.9 15 ...
## $ compactness : num 0.87 0.88 0.9 0.9 0.9 0.9 0.88 0.89 0.87 0.89 ...
## $ length : num 5.76 5.55 5.29 5.32 5.66 5.39 5.56 5.42 6.05 5.88 ...
## $ width : num 3.31 3.33 3.34 3.38 3.56 3.31 3.26 3.3 3.46 3.5 ...
## $ asymmetry : num 2.22 1.02 2.7 2.26 1.36 2.46 3.59 2.7 2.04 1.97 ...
## $ groove_length: num 5.22 4.96 4.83 4.8 5.17 4.96 5.22 5 5.88 5.53 ...
seeds_type <- c( 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 )
# Do k-means clustering with three clusters, repeat 20 times: seeds_km
seeds_km <- kmeans(seeds, centers=3, nstart=20)
# Print out seeds_km
seeds_km
## K-means clustering with 3 clusters of sizes 72, 77, 61
##
## Cluster means:
## area perimeter compactness length width asymmetry groove_length
## 1 14.64847 14.46042 0.8794444 5.563333 3.277639 2.649306 5.192778
## 2 11.96442 13.27481 0.8529870 5.229481 2.872857 4.759870 5.088442
## 3 18.72180 16.29738 0.8855738 6.209016 3.721967 3.603607 6.065902
##
## Clustering vector:
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 2 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1
## [36] 1 1 3 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 1 1 1 1 1 2
## [71] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 1 3 3 3 3
## [106] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 1 3 1 3 3 3 3 3 3 3 1 1 1 1 3 1 1 1
## [141] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [176] 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2
##
## Within cluster sum of squares by cluster:
## [1] 207.4138 195.7171 184.0488
## (between_SS / total_SS = 78.4 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault"
# Compare clusters with actual seed types. Set k-means clusters as rows
table(seeds_km$cluster, seeds_type)
## seeds_type
## 1 2 3
## 1 60 10 2
## 2 9 0 68
## 3 1 60 0
# Plot the length as function of width. Color by cluster
plot(x=seeds$width, y=seeds$length, col=seeds_km$cluster)
# Apply kmeans to seeds twice: seeds_km_1 and seeds_km_2
seeds_km_1 <- kmeans(seeds, centers=5, nstart=1)
seeds_km_2 <- kmeans(seeds, centers=5, nstart=1)
# Return the ratio of the within cluster sum of squares
seeds_km_1$tot.withinss / seeds_km_2$tot.withinss
## [1] 1.062865
# Compare the resulting clusters
table(seeds_km_1$cluster, seeds_km_2$cluster)
##
## 1 2 3 4 5
## 1 0 14 0 27 0
## 2 0 0 48 0 6
## 3 21 29 0 0 0
## 4 21 0 0 0 0
## 5 0 0 0 19 25
school_result <- data.frame(reading.4=c( 2.7, 3.9, 4.8, 3.1, 3.4, 3.1, 4.6, 3.1, 3.8, 5.2, 3.9, 4.1,
5.7, 3, 2.9, 3.4, 4, 3, 4, 3, 3.6, 3.1, 3.2, 3, 3.8
),
arithmetic.4=c( 3.2, 3.8, 4.1, 3.5, 3.7, 3.4, 4.4, 3.3, 3.7, 4.9, 3.8, 4,
5.1, 3.2, 3.3, 3.3, 4.2, 3, 4.1, 3.2, 3.6, 3.2, 3.3, 3.4, 4
),
reading.6=c( 4.5, 5.9, 6.8, 4.3, 5.1, 4.1, 6.6, 4, 4.7, 8.2, 5.2, 5.6, 7,
4.5, 4.5, 4.4, 5.2, 4.6, 5.9, 4.4, 5.3, 4.6, 5.4, 4.2, 6.9
),
arithmetic.6=c( 4.8, 6.2, 5.5, 4.6, 5.6, 4.7, 6.1, 4.9, 4.9, 6.9, 5.4, 5.6,
6.3, 5, 5.1, 5, 5.4, 5, 5.8, 5.1, 5.4, 5, 5.3, 4.7, 6.7
)
)
# Explore the structure of your data
str(school_result)
## 'data.frame': 25 obs. of 4 variables:
## $ reading.4 : num 2.7 3.9 4.8 3.1 3.4 3.1 4.6 3.1 3.8 5.2 ...
## $ arithmetic.4: num 3.2 3.8 4.1 3.5 3.7 3.4 4.4 3.3 3.7 4.9 ...
## $ reading.6 : num 4.5 5.9 6.8 4.3 5.1 4.1 6.6 4 4.7 8.2 ...
## $ arithmetic.6: num 4.8 6.2 5.5 4.6 5.6 4.7 6.1 4.9 4.9 6.9 ...
# Initialise ratio_ss
ratio_ss <- rep(0, 7)
# Finish the for-loop.
for (k in 1:7) {
# Apply k-means to school_result: school_km
school_km <- kmeans(school_result, centers=k, nstart=20)
# Save the ratio between of WSS to TSS in kth element of ratio_ss
ratio_ss[k] <- school_km$tot.withinss / school_km$totss
}
# Make a scree plot with type "b" and xlab "k"
plot(ratio_ss, type="b", xlab="k")
run_record <- data.frame(X100m=c( 10.23, 9.93, 10.15, 10.14, 10.27, 10, 9.84, 10.1, 10.17, 10.29, 10.97, 10.32, 10.24, 10.29, 10.16, 10.21, 10.02, 10.06, 9.87, 10.11, 10.32, 10.08, 10.33, 10.2, 10.35, 10.2, 10.01, 10, 10.28, 10.34, 10.6, 10.41, 10.3, 10.13, 10.21, 10.64, 10.19, 10.11, 10.08, 10.4, 10.57, 10, 9.86, 10.21, 10.11, 10.78, 10.37, 10.17, 10.18, 10.16, 10.36, 10.23, 10.38, 9.78 )
)
run_record$X200m <- c( 20.37, 20.06, 20.45, 20.19, 20.3, 19.89, 20.17, 20.15, 20.42, 20.85, 22.46, 20.96, 20.61, 20.52, 20.65, 20.47, 20.16, 20.23, 19.94, 19.85, 21.09, 20.11, 20.73, 20.93, 20.54, 20.89, 19.72, 20.03, 20.43, 20.41, 21.23, 20.77, 20.92, 20.06, 20.4, 21.52, 20.19, 20.42, 20.17, 21.18, 21.43, 19.98, 20.12, 20.75, 20.23, 21.86, 21.14, 20.59, 20.43, 20.41, 20.81, 20.69, 21.04, 19.32
)
run_record$X400m <- c( 46.18, 44.38, 45.8, 45.02, 45.26, 44.29, 44.72, 45.92, 45.25, 45.84, 51.4, 46.42, 45.77, 45.89, 44.9, 45.49, 44.64, 44.33, 44.36, 45.57, 48.44, 45.43, 45.48, 46.37, 45.58, 46.59, 45.26, 44.78, 44.18, 45.37, 46.95, 47.9, 46.41, 44.69, 44.31, 48.63, 45.68, 46.09, 46.11, 46.77, 45.57, 44.62, 46.11, 45.77, 44.6, 49.98, 47.6, 44.96, 45.54, 44.99, 46.72, 46.05, 46.63, 43.18
)
run_record$X800m <- c( 106.2, 104.4, 106.2, 103.8, 107.4, 102, 105, 105.6, 106.2, 108, 116.4, 112.2, 105, 101.4, 108.6, 104.4, 103.2, 103.8, 102, 105, 109.2, 105.6, 105.6, 109.8, 105, 108, 103.8, 106.2, 102, 104.4, 109.2, 105.6, 107.4, 108, 106.8, 108, 103.8, 104.4, 102.6, 108, 108, 103.2, 105, 105.6, 102.6, 116.4, 110.4, 103.8, 105.6, 102.6, 107.4, 108.6, 106.8, 102.6
)
run_record$X1500m <- c( 220.8, 211.8, 214.8, 214.2, 222, 214.2, 211.8, 219, 216.6, 223.2, 254.4, 230.4, 214.8, 211.2, 223.8, 216.6, 208.8, 211.8, 209.4, 216.6, 224.4, 215.4, 217.8, 226.2, 213.6, 222, 213, 217.2, 206.4, 218.4, 226.2, 220.2, 225.6, 229.8, 217.8, 228, 213, 212.4, 217.2, 240, 229.2, 215.4, 210, 214.2, 212.4, 240.6, 231.6, 208.8, 216.6, 211.8, 226.2, 226.2, 215.4, 207.6
)
run_record$X5000m <- c( 799.8, 775.8, 795.6, 769.8, 878.4, 808.8, 793.8, 803.4, 805.2, 809.4, 1002, 825, 805.2, 805.2, 858.6, 796.2, 778.8, 774.6, 780.6, 808.8, 838.8, 807, 810, 852.6, 784.2, 819.6, 785.4, 793.2, 759.6, 830.4, 834, 818.4, 846.6, 849, 787.8, 851.4, 793.2, 792.6, 786.6, 883.2, 838.2, 797.4, 783, 795, 792, 976.8, 897.6, 782.4, 797.4, 787.8, 834.6, 855, 807, 778.2
)
run_record$X10000m <- c( 1659, 1651.8, 1663.2, 1612.2, 1829.4, 1687.8, 1656, 1685.4, 1690.2, 1672.8, 2122.8, 1728.6, 1668, 1674.6, 1825.8, 1651.2, 1642.8, 1641.6, 1638, 1687.2, 1760.4, 1681.8, 1728.6, 1779, 1666.8, 1723.2, 1636.8, 1654.8, 1587.6, 1710.6, 1707, 1726.2, 1770, 1790.4, 1628.4, 1777.2, 1646.4, 1662, 1652.4, 1881.6, 1742.4, 1673.4, 1632.6, 1660.2, 1674, 2082.6, 1879.2, 1634.4, 1675.8, 1674, 1752, 1780.2, 1699.8, 1633.8
)
run_record$marathon <- c( 7774.2, 7650.6, 7933.2, 7632, 8782.2, 7563, 7805.4, 7931.4, 7750.8, 7870.2, 10275.6, 7993.8, 7894.2, 7765.8, 8760, 7869, 7581.6, 7708.2, 7627.8, 7922.4, 7951.8, 7926, 7920, 8350.8, 7749, 8052.6, 7637.4, 7569.6, 7473, 7632, 7755.6, 8041.8, 8956.2, 8584.2, 7631.4, 8374.2, 7698.6, 7715.4, 7810.2, 8887.8, 8306.4, 7753.8, 7581.6, 7938, 7749.6, 9690, 8653.2, 7633.8, 7822.8, 7773.6, 8061, 8359.8, 7815, 7522.8
)
rownames(run_record) <- c( 'Argentina', 'Australia', 'Austria', 'Belgium', 'Bermuda', 'Brazil', 'Canada', 'Chile', 'China', 'Columbia', 'CookIslands', 'CostaRica', 'CzechRepublic', 'Denmark', 'DominicanRepub', 'Finland', 'France', 'Germany', 'GreatBritain', 'Greece', 'Guatemala', 'Hungary', 'India', 'Indonesia', 'Ireland', 'Israel', 'Italy', 'Japan', 'Kenya', 'Korea,South', 'Korea,North', 'Luxembourg', 'Malaysia', 'Mauritius', 'Mexico', 'Myanmar(Burma)', 'Netherlands', 'NewZealand', 'Norway', 'PapuaNewGuinea', 'Philippines', 'Poland', 'Portugal', 'Romania', 'Russia', 'Samoa', 'Singapore', 'Spain', 'Sweden', 'Switzerland', 'Taiwan', 'Thailand', 'Turkey', 'U.S.A.'
)
# Explore your data with str() and summary()
str(run_record)
## 'data.frame': 54 obs. of 8 variables:
## $ X100m : num 10.23 9.93 10.15 10.14 10.27 ...
## $ X200m : num 20.4 20.1 20.4 20.2 20.3 ...
## $ X400m : num 46.2 44.4 45.8 45 45.3 ...
## $ X800m : num 106 104 106 104 107 ...
## $ X1500m : num 221 212 215 214 222 ...
## $ X5000m : num 800 776 796 770 878 ...
## $ X10000m : num 1659 1652 1663 1612 1829 ...
## $ marathon: num 7774 7651 7933 7632 8782 ...
summary(run_record)
## X100m X200m X400m X800m
## Min. : 9.78 Min. :19.32 Min. :43.18 Min. :101.4
## 1st Qu.:10.10 1st Qu.:20.17 1st Qu.:44.91 1st Qu.:103.8
## Median :10.20 Median :20.43 Median :45.58 Median :105.6
## Mean :10.22 Mean :20.54 Mean :45.83 Mean :106.1
## 3rd Qu.:10.32 3rd Qu.:20.84 3rd Qu.:46.32 3rd Qu.:108.0
## Max. :10.97 Max. :22.46 Max. :51.40 Max. :116.4
## X1500m X5000m X10000m marathon
## Min. :206.4 Min. : 759.6 Min. :1588 Min. : 7473
## 1st Qu.:213.0 1st Qu.: 788.9 1st Qu.:1653 1st Qu.: 7701
## Median :216.6 Median : 805.2 Median :1675 Median : 7819
## Mean :219.2 Mean : 817.1 Mean :1712 Mean : 8009
## 3rd Qu.:224.2 3rd Qu.: 834.5 3rd Qu.:1739 3rd Qu.: 8050
## Max. :254.4 Max. :1002.0 Max. :2123 Max. :10276
# Cluster run_record using k-means: run_km. 5 clusters, repeat 20 times
run_km <- kmeans(run_record, centers=5, nstart=20)
# Plot the 100m as function of the marathon. Color using clusters
plot(x=run_record$marathon, y=run_record$X100m, col=run_km$cluster)
# Calculate Dunn's index: dunn_km. Print it.
(dunn_km <- clValid::dunn(clusters=run_km$cluster, Data=run_record))
## [1] 0.05954843
# Standardize run_record, transform to a dataframe: run_record_sc
run_record_sc <- as.data.frame( scale(run_record) )
# Cluster run_record_sc using k-means: run_km_sc. 5 groups, let R start over 20 times
run_km_sc <- kmeans(run_record_sc, centers=5, nstart=20)
# Plot records on 100m as function of the marathon. Color using the clusters in run_km_sc
plot(x=run_record$marathon, y=run_record$X100m, col=run_km_sc$cluster,
xlab="Marathon", ylab="100 metres"
)
# Compare the resulting clusters in a nice table
table(run_km$cluster, run_km_sc$cluster)
##
## 1 2 3 4 5
## 1 0 0 2 2 0
## 2 0 0 0 6 0
## 3 3 15 8 0 0
## 4 0 0 0 0 2
## 5 11 5 0 0 0
# Calculate Dunn's index: dunn_km_sc. Print it.
(dunn_km_sc <- clValid::dunn(clusters=run_km_sc$cluster, Data=run_record_sc))
## [1] 0.1453556
# Apply dist() to run_record_sc: run_dist
run_dist <- dist(run_record_sc)
# Apply hclust() to run_dist: run_single
run_single <- hclust(run_dist, method="single")
# Apply cutree() to run_single: memb_single
memb_single <- cutree(run_single, k=5)
# Apply plot() on run_single to draw the dendrogram
plot(run_single)
# Apply rect.hclust() on run_single to draw the boxes
rect.hclust(run_single, k=5, border=2:6)
# Apply hclust() to run_dist: run_complete
run_complete <- hclust(run_dist, method="complete")
# Apply cutree() to run_complete: memb_complete
memb_complete <- cutree(run_complete, k=5)
# Apply plot() on run_complete to draw the dendrogram
plot(run_complete)
# Apply rect.hclust() on run_complete to draw the boxes
rect.hclust(run_complete, k=5, border=2:6)
# table() the clusters memb_single and memb_complete. Put memb_single in the rows
table(memb_single, memb_complete)
## memb_complete
## memb_single 1 2 3 4 5
## 1 27 7 14 0 1
## 2 0 0 0 1 0
## 3 0 0 0 0 1
## 4 0 0 0 0 2
## 5 0 0 0 1 0
# Dunn's index for k-means: dunn_km
dunn_km <- clValid::dunn(clusters=run_km_sc$cluster, Data=run_record_sc)
# Dunn's index for single-linkage: dunn_single
dunn_single <- clValid::dunn(clusters=memb_single, Data=run_record_sc)
# Dunn's index for complete-linkage: dunn_complete
dunn_complete <- clValid::dunn(clusters=memb_complete, Data=run_record_sc)
# Compare k-means with single-linkage
table(run_km_sc$cluster, memb_single)
## memb_single
## 1 2 3 4 5
## 1 14 0 0 0 0
## 2 20 0 0 0 0
## 3 9 0 1 0 0
## 4 6 0 0 2 0
## 5 0 1 0 0 1
# Compare k-means with complete-linkage
table(run_km_sc$cluster, memb_complete)
## memb_complete
## 1 2 3 4 5
## 1 7 7 0 0 0
## 2 20 0 0 0 0
## 3 0 0 8 0 2
## 4 0 0 6 0 2
## 5 0 0 0 2 0
crime_data <- data.frame(murder=c( 13.2, 10, 8.1, 8.8, 9, 7.9, 3.3, 5.9, 15.4, 17.4, 5.3, 2.6, 10.4, 7.2, 2.2, 6, 9.7, 15.4, 2.1, 11.3, 4.4, 12.1, 2.7, 16.1, 9, 6, 4.3, 12.2, 2.1, 7.4, 11.4, 11.1, 13, 0.8, 7.3, 6.6, 4.9, 6.3, 3.4, 14.4, 3.8, 13.2, 12.7, 3.2, 2.2, 8.5, 4, 5.7, 2.6, 6.8 )
)
crime_data$assault <- c( 236, 263, 294, 190, 276, 204, 110, 238, 335, 211, 46, 120, 249, 113, 56, 115, 109, 249, 83, 300, 149, 255, 72, 259, 178, 109, 102, 252, 57, 159, 285, 254, 337, 45, 120, 151, 159, 106, 174, 279, 86, 188, 201, 120, 48, 156, 145, 81, 53, 161
)
crime_data$urb_pop <- c( 58, 48, 80, 50, 91, 78, 77, 72, 80, 60, 83, 54, 83, 65, 57, 66, 52, 66, 51, 67, 85, 74, 66, 44, 70, 53, 62, 81, 56, 89, 70, 86, 45, 44, 75, 68, 67, 72, 87, 48, 45, 59, 80, 80, 32, 63, 73, 39, 66, 60
)
crime_data$rape <- c( 21.2, 44.5, 31, 19.5, 40.6, 38.7, 11.1, 15.8, 31.9, 25.8, 20.2, 14.2, 24, 21, 11.3, 18, 16.3, 22.2, 7.8, 27.8, 16.3, 35.1, 14.9, 17.1, 28.2, 16.4, 16.5, 46, 9.5, 18.8, 32.1, 26.1, 16.1, 7.3, 21.4, 20, 29.3, 14.9, 8.3, 22.5, 12.8, 26.9, 25.5, 22.9, 11.2, 20.7, 26.2, 9.3, 10.8, 15.6
)
rownames(crime_data) <- c( 'Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California', 'Colorado', 'Connecticut', 'Delaware', 'Florida', 'Georgia', 'Hawaii', 'Idaho', 'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky', 'Louisiana', 'Maine', 'Maryland', 'Massachusetts', 'Michigan', 'Minnesota', 'Mississippi', 'Missouri', 'Montana', 'Nebraska', 'Nevada', 'New Hampshire', 'New Jersey', 'New Mexico', 'New York', 'North Carolina', 'North Dakota', 'Ohio', 'Oklahoma', 'Oregon', 'Pennsylvania', 'Rhode Island', 'South Carolina', 'South Dakota', 'Tennessee', 'Texas', 'Utah', 'Vermont', 'Virginia', 'Washington', 'West Virginia', 'Wisconsin', 'Wyoming'
)
str(crime_data)
## 'data.frame': 50 obs. of 4 variables:
## $ murder : num 13.2 10 8.1 8.8 9 7.9 3.3 5.9 15.4 17.4 ...
## $ assault: num 236 263 294 190 276 204 110 238 335 211 ...
## $ urb_pop: num 58 48 80 50 91 78 77 72 80 60 ...
## $ rape : num 21.2 44.5 31 19.5 40.6 38.7 11.1 15.8 31.9 25.8 ...
# Scale the dataset: crime_data_sc
crime_data_sc <- as.data.frame(scale(crime_data))
# Perform k-means clustering: crime_km
crime_km <- kmeans(crime_data_sc, centers=4, nstart=20)
# Perform single-linkage hierarchical clustering
## Calculate the distance matrix: dist_matrix
dist_matrix <- dist(crime_data_sc)
## Calculate the clusters using hclust(): crime_single
crime_single <- hclust(dist_matrix, method="single")
## Cut the clusters using cutree: memb_single
memb_single <- cutree(crime_single, k=4)
# Calculate the Dunn's index for both clusterings: dunn_km, dunn_single
dunn_km <- clValid::dunn(clusters=crime_km$cluster, Data=crime_data_sc)
dunn_single <- clValid::dunn(clusters=memb_single, Data=crime_data_sc)
# Print out the results
dunn_km
## [1] 0.1604403
dunn_single
## [1] 0.2438734
table(crime_km$cluster, memb_single)
## memb_single
## 1 2 3 4
## 1 9 1 2 1
## 2 8 0 0 0
## 3 16 0 0 0
## 4 13 0 0 0
Chapter 1 - Unsupervised Learning in R
Introduction to the main types of machine learning:
Introduction to k-means clustering - assume a number of sub-groups, then iteratively assign/update the clusters/centroids:
How kmeans works and practical matters:
Introduction to the Pokemon data - 800 Pokemon each with 6 features:
Example code includes:
x <- matrix(data=NA, nrow=300, ncol=2)
x[,1] <- c( 3.37, 1.44, 2.36, 2.63, 2.4, 1.89, 3.51, 1.91, 4.02, 1.94, 3.3, 4.29, 0.61, 1.72, 1.87, 2.64, 1.72, -0.66, -0.44, 3.32, 1.69, 0.22, 1.83, 3.21, 3.9, 1.57, 1.74, 0.24, 2.46, 1.36, 2.46, 2.7, 3.04, 1.39, 2.5, 0.28, 1.22, 1.15, -0.41, 2.04, 2.21, 1.64, 2.76, 1.27, 0.63, 2.43, 1.19, 3.44, 1.57, 2.66, 2.32, 1.22, 3.58, 2.64, 2.09, 2.28, 2.68, 2.09, -0.99, 2.28, 1.63, 2.19, 2.58, 3.4, 1.27, 3.3, 2.34, 3.04, 2.92, 2.72, 0.96, 1.91, 2.62, 1.05, 1.46, 2.58, 2.77, 2.46, 1.11, 0.9, 3.51, 2.26, 2.09, 1.88, 0.81, 2.61, 1.78, 1.82, 2.93, 2.82, 3.39, 1.52, 2.65, 3.39, 0.89, 1.14, 0.87, 0.54, 2.08, 2.65, -3.8, -3.96, -6, -3.15, -5.67, -4.89, -5.42, -5.12, -4.81, -4.88, -5.03, -4.89, -5.49, -5.5, -6.66, -5.38, -5.51, -2.3, -6.36, -4.86, -6.49, -6.47, -4.88, -6, -5, -5.43, -5.61, -7.02, -6.22, -4.82, -4.43, -5.49, -5, -3.88, -3.56, -6.1, -5.12, -3.8, -5.47, -5.05, -5.09, -5.89, -5.44, -5.03, -5.41, -3.89, -5.48, -5.43, -4.3, -6.06, -5.04, -6.55, -3.83, -5.27, -5.47, -6.24, -5.01, -5.8, -5.53, -3.71, -5.18, -6.07, -4.84, -5.36, -4.41, -3.57, -5.99, -4.55, -4.92, -4.1, -5.23, -4.16, -6.75, -3.31, -4.14, -5.15, -6.45, -4.36, -4.52, -5.01, -4.85, -5.58, -4.63, -4.71, -5.28, -6.34, -4.3, -4.45, -5.84, -6.59, -4.8, -5.35, -4.75, -6.29, -5.96, -3.91, -4.6, -4.41, -3.18, -4.87, -7, -4.67, -3.83, -2.94, -6.38, -6.15, -5.71, -6.05, -5.65, -5.19, -6.2, -2.96, -4.89, -5.08, -4.5, -4.96, -5.13, -3.52, -5.22, -6.28, -4.61, -5.35, -5.52, -6.07, -4.57, -5.17, -4.48, -5.23, -5.66, -3.75, -5.27, -4.05, -6.2, -5.47, -5.27, -5.39, -3.65, -5.02, -4.76, -5.94, -5.73, -4, -3.74, -3.75, -6.38, -2.95, -3.98, -5.03, -4.3, -5.97, -0.1, 1.05, -0.2, 1.19, 2.3, -0.03, 0.26, 1.05, -0.02, 0.62, 1.87, 1.97, 1.38, -0.85, 0.95, 2.06, 1.81, 0.81, -1.7, 1.06, 1.57, 1.05, 1.16, 1.43, 0.6, 2.31, 1.47, -0.24, 2.38, 2.2, 1.82, -0.66, 0.43, 1.64, 1.04, 1.35, 3.46, 0.18, -1.11, 1.27, 0.31, 1.45, 0.19, 3.21, 0.88, 0.52, 0.83, 1.86, 1.1, -0.63 )
x[,2] <- c( 2, 2.76, 2.04, 2.74, 1.85, 1.94, 2.48, 2.99, 0.75, 1.97, 1.93, 1.24, 0.97, 1.37, 2.59, 1.58, 1.22, 2.16, 0.76, 3.05, 1.52, 2.19, 2.05, 2, 3.81, 1.17, 3.15, 2.03, 1.16, 1.93, 2.75, 1.57, 1.23, 2.15, 2.99, 1.93, 0.61, 0.69, 1.23, 1.47, 1.98, 2.67, 1.57, 0.89, 2.61, 2.28, 3.16, 0.32, 2.09, 3.35, 2.72, 1.17, 2.73, 1.13, 1.55, 3.19, 1.71, 2.83, 1.71, 0.42, 1.15, 0.91, 1.52, 1.66, 1.85, 1.76, 3.89, 0.61, 1.59, 2.35, 3.63, 2.09, 3.24, 0.36, 3.45, 1.31, 1.72, 0.89, 2.13, 3.79, 4.42, 0.92, 2.49, 3.39, 1.8, 1.78, 1.7, 2.6, 3.4, 2.69, 2.32, 1.7, 2.5, 1.45, 1.72, 3.1, 2.44, 2.24, 1.74, 2.93, 3.33, 1.13, 2.06, 2.05, 1.42, 1, 2, 2.66, 3.48, 0.09, 1.3, 1.69, 0.34, 1.25, 1.22, 1.28, -0.19, 2.21, 1.37, 3.52, 2.8, 0.55, 2.1, 1.41, 2.89, 2.05, 1.44, 2.44, 2.15, 1.84, 4.02, 1.47, 1.53, 0.45, 1.96, 2.89, -0.07, 1.75, 0.82, 3.44, 3.36, 2.33, 3.43, 1.13, 2.95, 1.41, 2.32, 1.7, 1.72, 2.55, 0.7, 1.75, 2.17, 1.6, 2.1, 1.68, 3.62, 2.71, 4.97, 1.2, 2.81, 4.1, 2.3, 0.92, 0.99, 1.96, 3.31, 2.75, -0.14, 1.3, 1.99, 0.54, 2.69, -0.46, 2.14, 1.61, 1.51, 1.72, 2.31, 2.4, 1.77, 0.08, 0.56, 0.53, 2.76, 1.76, 2.27, 0.44, 1.46, 2.56, 1.82, 1.88, 1.93, 3.21, 1.39, 2.68, 2.9, 0.81, 2.12, 1.99, 3.03, 2.91, 2, 2.14, 1.28, 1.8, 0.97, 1.03, 0.78, 2.84, 3.11, 1.59, 0.87, 1.91, 4.24, 4.04, 0.28, 1.64, 3.53, 1.96, 3.6, 1.67, 2.6, 2.22, 5.23, 2.92, 0.79, 1.4, 2.37, 0.1, 0.2, 0.88, 1.65, 3.24, 1.73, 2.16, 1.94, 1.29, 3.36, 0.9, 1.77, 1.65, 2.53, 3.61, 2.51, 3.38, 2.76, 1.38, 2.08, 3.38, -1.56, 0.32, -0.16, 0.88, 0.75, 0.3, 1.49, -1.53, 0.91, -1.58, 0.59, 0.09, 0.97, 0.08, -1.57, -2.01, 0.54, -0.07, -0.57, -0.31, -0.67, -0.16, -0.93, -1.98, -0.22, 1.05, 1.88, 0, -0.08, 0.96, 0.05, -0.43, -1.74, -1.26, 0.41, -1.46, 1.05, -1.35, -0.19, 0, -0.01, 0.15, 0.6, -0.13, -0.25, 0.16, -0.43, 1.54, -2.17, 1.03 )
str(x)
## num [1:300, 1:2] 3.37 1.44 2.36 2.63 2.4 1.89 3.51 1.91 4.02 1.94 ...
# Create the k-means model: km.out
km.out <- kmeans(x, centers=3, nstart=20)
# Inspect the result
summary(km.out)
## Length Class Mode
## cluster 300 -none- numeric
## centers 6 -none- numeric
## totss 1 -none- numeric
## withinss 3 -none- numeric
## tot.withinss 1 -none- numeric
## betweenss 1 -none- numeric
## size 3 -none- numeric
## iter 1 -none- numeric
## ifault 1 -none- numeric
# Print the cluster membership component of the model
km.out$cluster
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 3 1 1 1 1 3 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [36] 1 3 3 3 1 1 1 1 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 1 1 1 1 1 1 1 1 1 1 1
## [71] 1 1 1 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2
## [106] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [141] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [176] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [211] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [246] 2 2 2 2 2 3 3 3 3 1 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 1 1 3 3 1
## [281] 3 3 3 3 3 3 1 3 3 3 3 3 3 1 3 3 3 1 3 3
# Print the km.out object
km.out
## K-means clustering with 3 clusters of sizes 98, 150, 52
##
## Cluster means:
## [,1] [,2]
## 1 2.2170408 2.05153061
## 2 -5.0554667 1.96973333
## 3 0.6642308 -0.09115385
##
## Clustering vector:
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 3 1 1 1 1 3 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [36] 1 3 3 3 1 1 1 1 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 1 1 1 1 1 1 1 1 1 1 1
## [71] 1 1 1 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2
## [106] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [141] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [176] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [211] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [246] 2 2 2 2 2 3 3 3 3 1 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 1 1 3 3 1
## [281] 3 3 3 3 3 3 1 3 3 3 3 3 3 1 3 3 3 1 3 3
##
## Within cluster sum of squares by cluster:
## [1] 148.7013 295.1237 95.4708
## (between_SS / total_SS = 87.2 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault"
# Scatter plot of x
plot(x, col=km.out$cluster, main="k-means with 3 clusters", xlab="", ylab="")
# Set up 2 x 3 plotting grid
par(mfrow = c(2, 3))
for(i in 1:6) {
# Run kmeans() on x with three clusters and one start
km.out <- kmeans(x, centers=3, nstart=1)
# Plot clusters
plot(x, col = km.out$cluster,
main = km.out$tot.withinss,
xlab = "", ylab = "")
}
par(mfrow = c(1, 1))
# Initialize total within sum of squares error: wss
wss <- 0
# For 1 to 15 cluster centers
for (i in 1:15) {
km.out <- kmeans(x, centers = i, nstart=20)
# Save total within sum of squares to wss variable
wss[i] <- km.out$tot.withinss
}
# Plot total within sum of squares vs. number of clusters
plot(1:15, wss, type = "b",
xlab = "Number of Clusters",
ylab = "Within groups sum of squares")
pokemon <- matrix(data=NA, nrow=800, ncol=6)
v1 <- c(45, 60, 80, 80, 39, 58, 78, 78, 78, 44, 59, 79, 79, 45, 50, 60, 40, 45, 65, 65, 40, 63, 83, 83, 30, 55, 40, 65, 35, 60, 35, 60, 50, 75, 55, 70, 90, 46, 61, 81, 70, 95, 38, 73, 115, 140, 40, 75, 45, 60, 75, 35, 60, 60, 70, 10, 35, 40, 65, 50, 80, 40, 65, 55, 90, 40, 65, 90, 25, 40, 55, 55, 70, 80, 90, 50, 65, 80, 40, 80, 40, 55, 80, 50, 65, 90, 95, 95, 25, 50, 52, 35, 60, 65, 90, 80, 105, 30, 50, 30, 45, 60, 60, 35, 60, 85, 30, 55, 40, 60, 60, 95, 50, 60, 50, 50, 90, 40, 65, 80, 105, 250, 65, 105, 105, 30, 55, 45, 80, 30, 60, 40, 70, 65, 65, 65, 65, 65, 75, 20, 95, 95, 130, 48, 55, 130, 65, 65, 65, 35, 70, 30, 60, 80, 80, 160, 90, 90, 90, 41, 61, 91, 106, 106, 106, 100, 45, 60, 80, 39, 58, 78, 50, 65, 85, 35, 85, 60, 100, 40, 55, 40, 70, 85, 75, 125, 20, 50, 90, 35, 55, 40, 65, 55, 70, 90, 90, 75, 70, 100, 70, 90, 35, 55, 75, 55, 30, 75, 65, 55, 95, 65, 95, 60, 95, 60, 48, 190, 70, 50, 75, 100, 65, 75, 75, 60, 90, 65, 70, 70, 20, 80, 80, 55, 60, 90, 40, 50, 50, 100, 55, 35, 75, 45, 65, 65, 45, 75, 75, 75, 90, 90, 85, 73, 55, 35, 50, 45, 45, 45, 95, 255, 90, 115, 100, 50, 70, 100, 100, 106, 106, 100, 40, 50, 70, 70, 45, 60, 80, 80, 50, 70, 100, 100, 35, 70, 38, 78, 45, 50, 60, 50, 60, 40, 60, 80, 40, 70, 90, 40, 60, 40, 60, 28, 38, 68, 68, 40, 70, 60, 60, 60, 80, 150, 31, 61, 1, 64, 84, 104, 72, 144, 50, 30, 50, 70, 50, 50, 50, 50, 50, 60, 70, 70, 30, 60, 60, 40, 70, 70, 60, 60, 65, 65, 50, 70, 100, 45, 70, 70, 130, 170, 60, 70, 70, 70, 60, 80, 60, 45, 50, 80, 50, 70, 45, 75, 75, 73, 73, 70, 70, 50, 110, 43, 63, 40, 60, 66, 86, 45, 75, 20, 95, 70, 60, 44, 64, 64, 20, 40, 99, 65, 65, 65, 95, 50, 80, 80, 70, 90, 110, 35, 55, 55, 100, 43, 45, 65, 95, 95, 40, 60, 80, 80, 80, 80, 80, 80, 80, 80, 80, 100, 100, 100, 100, 105, 105, 100, 50, 50, 50, 50, 55, 75, 95, 44, 64, 76, 53, 64, 84, 40, 55, 85, 59, 79, 37, 77, 45, 60, 80, 40, 60, 67, 97, 30, 60, 40, 60, 60, 60, 70, 30, 70, 60, 55, 85, 45, 70, 76, 111, 75, 90, 150, 55, 65, 65, 60, 100, 49, 71, 45, 63, 103, 57, 67, 50, 20, 100, 76, 50, 58, 68, 108, 108, 135, 40, 70, 70, 68, 108, 40, 70, 48, 83, 74, 49, 69, 45, 60, 90, 90, 70, 70, 110, 115, 100, 75, 75, 85, 86, 65, 65, 75, 110, 85, 68, 68, 60, 45, 70, 50, 50, 50, 50, 50, 50, 75, 80, 75, 100, 90, 91, 110, 150, 150, 120, 80, 100, 70, 100, 100, 120, 100, 45, 60, 75, 65, 90, 110, 55, 75, 95, 45, 60, 45, 65, 85, 41, 64, 50, 75, 50, 75, 50, 75, 76, 116, 50, 62, 80, 45, 75, 55, 70, 85, 55, 67, 60, 110, 103, 103, 75, 85, 105, 50, 75, 105, 120, 75, 45, 55, 75, 30, 40, 60, 40, 60, 45, 70, 70, 50, 60, 95, 70, 105, 105, 75, 50, 70, 50, 65, 72, 38, 58, 54, 74, 55, 75, 50, 80, 40, 60, 55, 75, 45, 60, 70, 45, 65, 110, 62, 75, 36, 51, 71, 60, 80, 55, 50, 70, 69, 114, 55, 100, 165, 50, 70, 44, 74, 40, 60, 60)
pokemon[, 1] <- c( v1, 35, 65, 85, 55, 75, 50, 60, 60, 46, 66, 76, 55, 95, 70, 50, 80, 109, 45, 65, 77, 59, 89, 45, 65, 95, 70, 100, 70, 110, 85, 58, 52, 72, 92, 55, 85, 91, 91, 91, 79, 79, 79, 79, 100, 100, 89, 89, 125, 125, 125, 91, 91, 100, 100, 71, 56, 61, 88, 40, 59, 75, 41, 54, 72, 38, 85, 45, 62, 78, 38, 45, 80, 62, 86, 44, 54, 78, 66, 123, 67, 95, 75, 62, 74, 74, 45, 59, 60, 60, 78, 101, 62, 82, 53, 86, 42, 72, 50, 65, 50, 71, 44, 62, 58, 82, 77, 123, 95, 78, 67, 50, 45, 68, 90, 57, 43, 85, 49, 44, 54, 59, 65, 55, 75, 85, 55, 95, 40, 85, 126, 126, 108, 50, 50, 80, 80, 80 )
v2 <- c(49, 62, 82, 100, 52, 64, 84, 130, 104, 48, 63, 83, 103, 30, 20, 45, 35, 25, 90, 150, 45, 60, 80, 80, 56, 81, 60, 90, 60, 85, 55, 90, 75, 100, 47, 62, 92, 57, 72, 102, 45, 70, 41, 76, 45, 70, 45, 80, 50, 65, 80, 70, 95, 55, 65, 55, 80, 45, 70, 52, 82, 80, 105, 70, 110, 50, 65, 95, 20, 35, 50, 50, 80, 100, 130, 75, 90, 105, 40, 70, 80, 95, 120, 85, 100, 65, 75, 75, 35, 60, 65, 85, 110, 45, 70, 80, 105, 65, 95, 35, 50, 65, 65, 45, 48, 73, 105, 130, 30, 50, 40, 95, 50, 80, 120, 105, 55, 65, 90, 85, 130, 5, 55, 95, 125, 40, 65, 67, 92, 45, 75, 45, 110, 50, 83, 95, 125, 155, 100, 10, 125, 155, 85, 48, 55, 65, 65, 130, 60, 40, 60, 80, 115, 105, 135, 110, 85, 90, 100, 64, 84, 134, 110, 190, 150, 100, 49, 62, 82, 52, 64, 84, 65, 80, 105, 46, 76, 30, 50, 20, 35, 60, 90, 90, 38, 58, 40, 25, 30, 20, 40, 50, 75, 40, 55, 75, 95, 80, 20, 50, 100, 75, 35, 45, 55, 70, 30, 75, 65, 45, 85, 65, 65, 85, 75, 60, 72, 33, 80, 65, 90, 70, 75, 85, 125, 80, 120, 95, 130, 150, 10, 125, 185, 95, 80, 130, 40, 50, 50, 100, 55, 65, 105, 55, 40, 80, 60, 90, 90, 95, 60, 120, 80, 95, 20, 35, 95, 30, 63, 75, 80, 10, 85, 115, 75, 64, 84, 134, 164, 90, 130, 100, 45, 65, 85, 110, 60, 85, 120, 160, 70, 85, 110, 150, 55, 90, 30, 70, 45, 35, 70, 35, 50, 30, 50, 70, 40, 70, 100, 55, 85, 30, 50, 25, 35, 65, 85, 30, 60, 40, 130, 60, 80, 160, 45, 90, 90, 51, 71, 91, 60, 120, 20, 45, 45, 65, 75, 85, 85, 105, 70, 90, 110, 140, 40, 60, 100, 45, 75, 75, 50, 40, 73, 47, 60, 43, 73, 90, 120, 140, 70, 90, 60, 100, 120, 85, 25, 45, 60, 100, 70, 100, 85, 115, 40, 70, 110, 115, 100, 55, 95, 48, 78, 80, 120, 40, 70, 41, 81, 95, 125, 15, 60, 70, 90, 75, 115, 165, 40, 70, 68, 50, 130, 150, 23, 50, 80, 120, 40, 60, 80, 64, 104, 84, 90, 30, 75, 95, 135, 145, 55, 75, 135, 145, 100, 50, 75, 80, 100, 90, 130, 100, 150, 150, 180, 150, 180, 100, 150, 180, 70, 95, 68, 89, 109, 58, 78, 104, 51, 66, 86, 55, 75, 120, 45, 85, 25, 85, 65, 85, 120, 30, 70, 125, 165, 42, 52, 29, 59, 79, 69, 94, 30, 80, 45, 65, 105, 35, 60, 48, 83, 100, 50, 80, 66, 76, 136, 60, 125, 55, 82, 30, 63, 93, 24, 89, 80, 25, 5, 65, 92, 70, 90, 130, 170, 85, 70, 110, 145, 72, 112, 50, 90, 61, 106, 100, 49, 69, 20, 62, 92, 132, 120, 70, 85, 140, 100, 123, 95, 50, 76, 110, 60, 95, 130, 80, 125, 165, 55, 100, 80, 50, 65, 65, 65, 65, 65, 75, 105, 125, 120, 120, 90, 160, 100, 120, 70, 80, 100, 90, 100, 103, 120, 100, 45, 60, 75, 63, 93, 123, 55, 75, 100, 55, 85, 60, 80, 110, 50, 88, 53, 98, 53, 98, 53, 98, 25, 55, 55, 77, 115, 60, 100, 75, 105, 135, 45, 57, 85, 135, 60, 60, 80, 105, 140, 50, 65, 95, 100, 125, 53, 63, 103, 45, 55, 100, 27, 67, 35, 60, 92, 72, 82, 117, 90, 140, 30, 86, 65, 95, 75, 90, 58, 30, 50, 78, 108, 112, 140, 50, 95, 65, 105, 50, 95, 30, 45, 55, 30, 40, 65, 44, 87, 50, 65, 95, 60, 100, 75, 75, 135, 55, 85, 40, 60, 75, 47, 77, 50, 94, 55, 80, 100)
pokemon[,2] <- c( v2, 55, 85, 115, 55, 75, 30, 40, 55, 87, 117, 147, 70, 110, 50, 40, 70, 66, 85, 125, 120, 74, 124, 85, 125, 110, 83, 123, 55, 65, 97, 109, 65, 85, 105, 85, 60, 90, 129, 90, 115, 100, 115, 105, 120, 150, 125, 145, 130, 170, 120, 72, 72, 77, 128, 120, 61, 78, 107, 45, 59, 69, 56, 63, 95, 36, 56, 50, 73, 81, 35, 22, 52, 50, 68, 38, 45, 65, 65, 100, 82, 124, 80, 48, 48, 48, 80, 110, 150, 50, 52, 72, 48, 80, 54, 92, 52, 105, 60, 75, 53, 73, 38, 55, 89, 121, 59, 77, 65, 92, 58, 50, 50, 75, 100, 80, 70, 110, 66, 66, 66, 66, 90, 85, 95, 100, 69, 117, 30, 70, 131, 131, 100, 100, 160, 110, 160, 110 )
v3 <- c(49, 63, 83, 123, 43, 58, 78, 111, 78, 65, 80, 100, 120, 35, 55, 50, 30, 50, 40, 40, 40, 55, 75, 80, 35, 60, 30, 65, 44, 69, 40, 55, 85, 110, 52, 67, 87, 40, 57, 77, 48, 73, 40, 75, 20, 45, 35, 70, 55, 70, 85, 55, 80, 50, 60, 25, 50, 35, 60, 48, 78, 35, 60, 45, 80, 40, 65, 95, 15, 30, 45, 65, 50, 70, 80, 35, 50, 65, 35, 65, 100, 115, 130, 55, 70, 65, 110, 180, 70, 95, 55, 45, 70, 55, 80, 50, 75, 100, 180, 30, 45, 60, 80, 160, 45, 70, 90, 115, 50, 70, 80, 85, 95, 110, 53, 79, 75, 95, 120, 95, 120, 5, 115, 80, 100, 70, 95, 60, 65, 55, 85, 65, 80, 35, 57, 57, 100, 120, 95, 55, 79, 109, 80, 48, 50, 60, 60, 60, 70, 100, 125, 90, 105, 65, 85, 65, 100, 85, 90, 45, 65, 95, 90, 100, 70, 100, 65, 80, 100, 43, 58, 78, 64, 80, 100, 34, 64, 30, 50, 30, 50, 40, 70, 80, 38, 58, 15, 28, 15, 65, 85, 45, 70, 40, 55, 85, 105, 95, 50, 80, 115, 75, 40, 50, 70, 55, 30, 55, 45, 45, 85, 60, 110, 42, 80, 60, 48, 58, 65, 90, 140, 70, 105, 200, 230, 50, 75, 75, 100, 140, 230, 75, 115, 55, 50, 75, 40, 120, 40, 80, 85, 35, 75, 45, 70, 140, 30, 50, 90, 95, 60, 120, 90, 62, 35, 35, 95, 15, 37, 37, 105, 10, 75, 85, 115, 50, 70, 110, 150, 130, 90, 100, 35, 45, 65, 75, 40, 60, 70, 80, 50, 70, 90, 110, 35, 70, 41, 61, 35, 55, 50, 55, 70, 30, 50, 70, 50, 40, 60, 30, 60, 30, 100, 25, 35, 65, 65, 32, 62, 60, 80, 60, 80, 100, 90, 45, 45, 23, 43, 63, 30, 60, 40, 135, 45, 65, 75, 125, 85, 125, 100, 140, 180, 230, 55, 75, 85, 40, 60, 80, 40, 50, 55, 55, 45, 53, 83, 20, 40, 70, 35, 45, 40, 70, 100, 140, 35, 65, 60, 45, 50, 80, 40, 60, 60, 90, 110, 60, 60, 65, 85, 43, 73, 65, 85, 55, 105, 77, 97, 50, 100, 20, 79, 70, 70, 35, 65, 75, 90, 130, 83, 70, 60, 60, 48, 50, 80, 80, 50, 70, 90, 85, 105, 105, 130, 55, 60, 100, 80, 130, 80, 100, 130, 150, 200, 100, 150, 90, 120, 80, 100, 90, 90, 140, 160, 90, 100, 100, 50, 20, 160, 90, 64, 85, 105, 44, 52, 71, 53, 68, 88, 30, 50, 70, 40, 60, 41, 51, 34, 49, 79, 35, 65, 40, 60, 118, 168, 45, 85, 105, 95, 50, 42, 102, 70, 35, 55, 45, 70, 48, 68, 66, 34, 44, 44, 84, 94, 60, 52, 42, 64, 50, 47, 67, 86, 116, 95, 45, 5, 45, 108, 45, 65, 95, 115, 40, 40, 70, 88, 78, 118, 90, 110, 40, 65, 72, 56, 76, 50, 50, 75, 105, 65, 115, 95, 130, 125, 67, 67, 95, 86, 130, 110, 125, 80, 70, 65, 95, 145, 135, 70, 77, 107, 107, 107, 107, 107, 130, 105, 70, 120, 100, 106, 110, 120, 100, 120, 80, 100, 90, 100, 75, 120, 100, 55, 75, 95, 45, 55, 65, 45, 60, 85, 39, 69, 45, 65, 90, 37, 50, 48, 63, 48, 63, 48, 63, 45, 85, 50, 62, 80, 32, 63, 85, 105, 130, 43, 55, 40, 60, 86, 126, 55, 85, 95, 40, 55, 75, 85, 75, 70, 90, 80, 59, 99, 89, 60, 85, 50, 75, 65, 35, 45, 80, 45, 55, 105, 67, 85, 125, 70, 115, 80, 85, 145, 103, 133, 45, 65, 62, 82, 40, 60, 40, 60, 50, 70, 95, 40, 50, 75, 50, 63, 50, 65, 85, 50, 70, 60, 45, 105, 45, 70, 50, 70, 80, 50, 60, 91, 131, 70, 95, 115, 40)
pokemon[,3] <- c( v3, 70, 80, 55, 75, 55, 60, 90, 60, 70, 90, 40, 80, 30, 85, 40, 84, 50, 60, 90, 50, 80, 70, 100, 95, 50, 75, 75, 105, 66, 112, 50, 70, 90, 55, 65, 129, 90, 72, 70, 80, 70, 70, 100, 120, 90, 90, 90, 100, 90, 90, 90, 77, 90, 95, 65, 95, 122, 40, 58, 72, 40, 52, 67, 38, 77, 43, 55, 71, 40, 60, 50, 58, 72, 39, 47, 68, 48, 62, 62, 78, 60, 54, 76, 76, 100, 150, 50, 150, 60, 72, 66, 86, 53, 88, 67, 115, 60, 90, 62, 88, 33, 52, 77, 119, 50, 72, 65, 75, 57, 150, 35, 53, 70, 91, 48, 76, 70, 70, 70, 70, 122, 122, 122, 122, 85, 184, 35, 80, 95, 95, 121, 150, 110, 60, 60, 120 )
v4 <- c(65, 80, 100, 122, 60, 80, 109, 130, 159, 50, 65, 85, 135, 20, 25, 90, 20, 25, 45, 15, 35, 50, 70, 135, 25, 50, 31, 61, 40, 65, 50, 90, 20, 45, 40, 55, 75, 40, 55, 85, 60, 95, 50, 81, 45, 85, 30, 65, 75, 85, 110, 45, 60, 40, 90, 35, 50, 40, 65, 65, 95, 35, 60, 70, 100, 40, 50, 70, 105, 120, 135, 175, 35, 50, 65, 70, 85, 100, 50, 80, 30, 45, 55, 65, 80, 40, 100, 130, 95, 120, 58, 35, 60, 45, 70, 40, 65, 45, 85, 100, 115, 130, 170, 30, 43, 73, 25, 50, 55, 80, 60, 125, 40, 50, 35, 35, 60, 60, 85, 30, 45, 35, 100, 40, 60, 70, 95, 35, 65, 70, 100, 100, 55, 115, 95, 100, 55, 65, 40, 15, 60, 70, 85, 48, 45, 110, 110, 95, 85, 90, 115, 55, 65, 60, 70, 65, 95, 125, 125, 50, 70, 100, 154, 154, 194, 100, 49, 63, 83, 60, 80, 109, 44, 59, 79, 35, 45, 36, 76, 40, 55, 40, 60, 70, 56, 76, 35, 45, 40, 40, 80, 70, 95, 65, 80, 115, 165, 90, 20, 60, 30, 90, 35, 45, 55, 40, 30, 105, 75, 25, 65, 130, 60, 85, 100, 85, 72, 33, 90, 35, 60, 65, 35, 55, 55, 40, 60, 55, 55, 65, 10, 40, 40, 35, 50, 75, 70, 80, 30, 60, 65, 65, 105, 65, 80, 40, 80, 110, 140, 95, 40, 60, 105, 85, 20, 35, 35, 85, 65, 70, 40, 75, 115, 90, 90, 45, 65, 95, 95, 90, 110, 100, 65, 85, 105, 145, 70, 85, 110, 130, 50, 60, 85, 95, 30, 60, 30, 50, 20, 25, 100, 25, 50, 40, 60, 90, 30, 60, 90, 30, 50, 55, 85, 45, 65, 125, 165, 50, 80, 40, 60, 35, 55, 95, 30, 50, 30, 51, 71, 91, 20, 40, 20, 45, 35, 55, 65, 85, 55, 55, 40, 50, 60, 60, 40, 60, 80, 65, 105, 135, 85, 75, 47, 73, 100, 43, 73, 65, 95, 110, 70, 90, 65, 105, 145, 85, 70, 90, 60, 45, 50, 80, 85, 115, 40, 70, 110, 60, 100, 95, 55, 46, 76, 50, 90, 40, 70, 61, 81, 40, 70, 10, 100, 70, 60, 63, 83, 93, 30, 60, 72, 95, 75, 115, 23, 50, 80, 120, 55, 75, 95, 74, 94, 114, 45, 40, 40, 60, 110, 120, 35, 55, 95, 105, 50, 100, 75, 110, 140, 130, 160, 150, 180, 100, 150, 150, 180, 100, 150, 180, 70, 95, 45, 55, 75, 58, 78, 104, 61, 81, 111, 30, 40, 50, 35, 55, 25, 55, 40, 60, 95, 50, 125, 30, 65, 42, 47, 29, 79, 59, 69, 94, 30, 80, 45, 60, 85, 62, 87, 57, 92, 60, 60, 90, 44, 54, 54, 105, 105, 42, 64, 65, 41, 71, 24, 79, 10, 70, 15, 92, 92, 40, 50, 80, 120, 40, 35, 115, 140, 38, 68, 30, 60, 61, 86, 90, 49, 69, 60, 62, 92, 132, 45, 130, 80, 55, 110, 95, 125, 120, 116, 60, 130, 45, 70, 135, 65, 65, 75, 65, 80, 95, 105, 105, 105, 105, 105, 75, 105, 125, 150, 150, 130, 80, 100, 120, 75, 80, 100, 135, 100, 120, 120, 100, 45, 60, 75, 45, 70, 100, 63, 83, 108, 35, 60, 25, 35, 45, 50, 88, 53, 98, 53, 98, 53, 98, 67, 107, 36, 50, 65, 50, 80, 25, 50, 60, 55, 77, 30, 50, 60, 80, 25, 40, 55, 50, 65, 85, 30, 30, 40, 50, 70, 30, 40, 55, 37, 77, 70, 110, 80, 35, 45, 65, 15, 30, 140, 106, 35, 65, 35, 45, 103, 55, 95, 53, 83, 74, 112, 40, 60, 80, 120, 40, 65, 55, 75, 95, 105, 125, 125, 44, 87, 65, 80, 110, 40, 60, 75, 40, 60, 55, 85, 65, 85, 40, 57, 97, 24, 54, 45, 70, 70)
pokemon[,4] <- c( v4, 45, 75, 105, 85, 125, 65, 95, 145, 30, 40, 60, 60, 70, 95, 40, 100, 81, 55, 95, 60, 35, 55, 40, 60, 40, 37, 57, 45, 55, 105, 48, 45, 65, 125, 50, 135, 90, 72, 90, 125, 110, 125, 145, 150, 120, 115, 105, 130, 120, 170, 129, 129, 128, 77, 120, 48, 56, 74, 62, 90, 114, 62, 83, 103, 32, 50, 40, 56, 74, 27, 27, 90, 73, 109, 61, 75, 112, 62, 97, 46, 69, 65, 63, 83, 83, 35, 45, 150, 50, 63, 99, 59, 85, 37, 68, 39, 54, 60, 97, 58, 120, 61, 109, 45, 69, 67, 99, 110, 74, 81, 50, 55, 83, 110, 80, 50, 65, 44, 44, 44, 44, 58, 58, 58, 58, 32, 44, 45, 97, 131, 131, 81, 100, 160, 150, 170, 130 )
v5 <- c(65, 80, 100, 120, 50, 65, 85, 85, 115, 64, 80, 105, 115, 20, 25, 80, 20, 25, 80, 80, 35, 50, 70, 80, 35, 70, 31, 61, 54, 79, 50, 80, 30, 55, 40, 55, 85, 40, 55, 75, 65, 90, 65, 100, 25, 50, 40, 75, 65, 75, 90, 55, 80, 55, 75, 45, 70, 40, 65, 50, 80, 45, 70, 50, 80, 40, 50, 90, 55, 70, 95, 95, 35, 60, 85, 30, 45, 70, 100, 120, 30, 45, 65, 65, 80, 40, 80, 80, 55, 70, 62, 35, 60, 70, 95, 50, 100, 25, 45, 35, 55, 75, 95, 45, 90, 115, 25, 50, 55, 80, 45, 65, 50, 80, 110, 110, 75, 45, 70, 30, 45, 105, 40, 80, 100, 25, 45, 50, 80, 55, 85, 120, 80, 95, 85, 85, 70, 90, 70, 20, 100, 130, 95, 48, 65, 95, 95, 110, 75, 55, 70, 45, 70, 75, 95, 110, 125, 90, 85, 50, 70, 100, 90, 100, 120, 100, 65, 80, 100, 50, 65, 85, 48, 63, 83, 45, 55, 56, 96, 80, 110, 40, 60, 80, 56, 76, 35, 55, 20, 65, 105, 45, 70, 45, 60, 90, 110, 100, 50, 80, 65, 100, 55, 65, 95, 55, 30, 85, 45, 25, 65, 95, 130, 42, 110, 85, 48, 58, 65, 35, 60, 65, 65, 65, 95, 40, 60, 55, 80, 100, 230, 95, 105, 75, 50, 75, 40, 80, 30, 60, 85, 35, 75, 45, 140, 70, 50, 80, 90, 95, 40, 60, 95, 65, 45, 35, 110, 65, 55, 55, 70, 135, 100, 75, 115, 50, 70, 100, 120, 154, 154, 100, 55, 65, 85, 85, 50, 60, 70, 80, 50, 70, 90, 110, 30, 60, 41, 61, 30, 25, 50, 25, 90, 50, 70, 100, 30, 40, 60, 30, 50, 30, 70, 35, 55, 115, 135, 52, 82, 60, 60, 35, 55, 65, 30, 50, 30, 23, 43, 73, 30, 60, 40, 90, 35, 55, 65, 115, 55, 95, 40, 50, 60, 80, 55, 75, 85, 40, 60, 80, 75, 85, 75, 75, 80, 53, 83, 20, 40, 65, 35, 45, 45, 75, 105, 70, 80, 110, 60, 45, 50, 80, 40, 60, 75, 105, 105, 60, 60, 85, 65, 41, 71, 35, 55, 70, 120, 87, 107, 50, 80, 55, 125, 70, 120, 33, 63, 83, 90, 130, 87, 80, 60, 60, 48, 50, 80, 80, 50, 70, 90, 55, 75, 75, 65, 65, 30, 50, 80, 90, 60, 80, 90, 110, 100, 200, 150, 130, 150, 110, 120, 140, 160, 90, 90, 90, 100, 100, 50, 20, 160, 90, 55, 65, 85, 44, 52, 71, 56, 76, 101, 30, 40, 60, 40, 60, 41, 51, 34, 49, 79, 70, 105, 30, 50, 88, 138, 45, 105, 85, 95, 50, 42, 102, 90, 30, 50, 53, 78, 62, 82, 66, 44, 54, 56, 96, 96, 105, 52, 37, 59, 50, 41, 61, 86, 116, 45, 90, 65, 42, 108, 45, 55, 85, 95, 85, 40, 70, 70, 42, 72, 55, 75, 40, 65, 72, 61, 86, 120, 60, 85, 105, 85, 90, 95, 55, 50, 85, 95, 115, 56, 65, 95, 75, 60, 75, 115, 115, 150, 135, 70, 77, 107, 107, 107, 107, 107, 130, 105, 70, 100, 120, 106, 110, 120, 100, 130, 80, 100, 90, 100, 75, 120, 100, 55, 75, 95, 45, 55, 65, 45, 60, 70, 39, 69, 45, 65, 90, 37, 50, 48, 63, 48, 63, 48, 63, 55, 95, 30, 42, 55, 32, 63, 25, 40, 80, 43, 55, 45, 65, 86, 126, 35, 50, 65, 40, 55, 75, 85, 75, 60, 80, 80, 39, 79, 69, 50, 75, 50, 75, 55, 35, 45, 70, 45, 55, 105, 67, 35, 75, 70, 115, 80, 65, 105, 45, 65, 45, 65, 62, 82, 40, 60, 40, 60, 65, 85, 110, 50, 60, 85, 50, 63, 60, 75, 95, 50, 70, 60, 45, 105, 55, 80, 85, 105, 45, 50, 60, 86, 116, 60, 85, 85, 40)
pokemon[,5] <- c( v5, 70, 80, 55, 95, 55, 60, 90, 40, 50, 70, 40, 80, 135, 65, 60, 99, 50, 60, 90, 50, 80, 40, 70, 95, 50, 75, 65, 95, 66, 48, 50, 70, 90, 55, 105, 72, 90, 129, 80, 90, 80, 80, 120, 100, 80, 80, 90, 90, 100, 90, 90, 128, 77, 95, 45, 58, 75, 60, 70, 100, 44, 56, 71, 36, 77, 38, 52, 69, 25, 30, 50, 54, 66, 79, 98, 154, 57, 81, 48, 71, 90, 60, 81, 81, 37, 49, 50, 150, 65, 89, 57, 75, 46, 75, 56, 86, 60, 123, 63, 89, 43, 94, 45, 59, 63, 92, 130, 63, 67, 150, 75, 113, 150, 87, 60, 82, 55, 55, 55, 55, 75, 75, 75, 75, 35, 46, 40, 80, 98, 98, 95, 150, 110, 130, 130, 90 )
v6 <- c(45, 60, 80, 80, 65, 80, 100, 100, 100, 43, 58, 78, 78, 45, 30, 70, 50, 35, 75, 145, 56, 71, 101, 121, 72, 97, 70, 100, 55, 80, 90, 110, 40, 65, 41, 56, 76, 50, 65, 85, 35, 60, 65, 100, 20, 45, 55, 90, 30, 40, 50, 25, 30, 45, 90, 95, 120, 90, 115, 55, 85, 70, 95, 60, 95, 90, 90, 70, 90, 105, 120, 150, 35, 45, 55, 40, 55, 70, 70, 100, 20, 35, 45, 90, 105, 15, 30, 30, 45, 70, 60, 75, 100, 45, 70, 25, 50, 40, 70, 80, 95, 110, 130, 70, 42, 67, 50, 75, 100, 140, 40, 55, 35, 45, 87, 76, 30, 35, 60, 25, 40, 50, 60, 90, 100, 60, 85, 63, 68, 85, 115, 90, 105, 95, 105, 93, 85, 105, 110, 80, 81, 81, 60, 48, 55, 65, 130, 65, 40, 35, 55, 55, 80, 130, 150, 30, 85, 100, 90, 50, 70, 80, 130, 130, 140, 100, 45, 60, 80, 65, 80, 100, 43, 58, 78, 20, 90, 50, 70, 55, 85, 30, 40, 130, 67, 67, 60, 15, 15, 20, 40, 70, 95, 35, 45, 55, 45, 50, 40, 50, 30, 70, 50, 80, 110, 85, 30, 30, 95, 15, 35, 110, 65, 91, 30, 85, 48, 33, 85, 15, 40, 45, 85, 30, 30, 30, 45, 85, 65, 75, 5, 85, 75, 115, 40, 55, 20, 30, 50, 50, 35, 65, 45, 75, 70, 70, 65, 95, 115, 85, 40, 50, 60, 85, 75, 35, 70, 65, 95, 83, 100, 55, 115, 100, 85, 41, 51, 61, 71, 110, 90, 100, 70, 95, 120, 145, 45, 55, 80, 100, 40, 50, 60, 70, 35, 70, 60, 100, 20, 15, 65, 15, 65, 30, 50, 70, 30, 60, 80, 85, 125, 85, 65, 40, 50, 80, 100, 65, 60, 35, 70, 30, 90, 100, 40, 160, 40, 28, 48, 68, 25, 50, 20, 30, 50, 70, 50, 20, 50, 50, 30, 40, 50, 50, 60, 80, 100, 65, 105, 135, 95, 95, 85, 85, 65, 40, 55, 65, 95, 105, 60, 60, 35, 40, 20, 20, 60, 80, 60, 10, 70, 100, 35, 55, 50, 80, 80, 90, 65, 70, 70, 60, 60, 35, 55, 55, 75, 23, 43, 75, 45, 80, 81, 70, 40, 45, 65, 75, 25, 25, 51, 65, 75, 115, 23, 50, 80, 100, 25, 45, 65, 32, 52, 52, 55, 97, 50, 50, 100, 120, 30, 50, 70, 110, 50, 50, 50, 110, 110, 110, 110, 90, 90, 90, 90, 95, 115, 100, 150, 150, 90, 180, 31, 36, 56, 61, 81, 108, 40, 50, 60, 60, 80, 100, 31, 71, 25, 65, 45, 60, 70, 55, 90, 58, 58, 30, 30, 36, 36, 36, 36, 66, 70, 40, 95, 85, 115, 35, 85, 34, 39, 115, 70, 80, 85, 105, 135, 105, 71, 85, 112, 45, 74, 84, 23, 33, 10, 60, 30, 91, 35, 42, 82, 102, 92, 5, 60, 90, 112, 32, 47, 65, 95, 50, 85, 46, 66, 91, 50, 40, 60, 30, 125, 60, 50, 40, 50, 95, 83, 80, 95, 95, 65, 95, 80, 90, 80, 110, 40, 45, 110, 91, 86, 86, 86, 86, 86, 95, 80, 115, 90, 100, 77, 100, 90, 90, 85, 80, 100, 125, 100, 127, 120, 100, 63, 83, 113, 45, 55, 65, 45, 60, 70, 42, 77, 55, 60, 80, 66, 106, 64, 101, 64, 101, 64, 101, 24, 29, 43, 65, 93, 76, 116, 15, 20, 25, 72, 114, 68, 88, 50, 50, 35, 40, 45, 64, 69, 74, 45, 85, 42, 42, 92, 57, 47, 112, 66, 116, 30, 90, 98, 65, 74, 92, 50, 95, 55, 60, 55, 45, 48, 58, 97, 30, 30, 22, 32, 70, 110, 65, 75, 65, 105, 75, 115, 45, 55, 65, 20, 30, 30, 55, 98, 44, 59, 79, 75, 95, 103, 60, 20, 15, 30, 40, 60, 65, 65, 108, 10, 20, 30, 50, 90, 60)
pokemon[,6] <- c( v6, 40, 50, 30, 40, 20, 55, 80, 57, 67, 97, 40, 50, 105, 25, 145, 32, 65, 105, 48, 35, 55, 60, 70, 55, 60, 80, 60, 80, 65, 109, 38, 58, 98, 60, 100, 108, 108, 108, 111, 121, 111, 101, 90, 90, 101, 91, 95, 95, 95, 108, 108, 90, 128, 99, 38, 57, 64, 60, 73, 104, 71, 97, 122, 57, 78, 62, 84, 126, 35, 29, 89, 72, 106, 42, 52, 75, 52, 68, 43, 58, 102, 68, 104, 104, 28, 35, 60, 60, 23, 29, 49, 72, 45, 73, 50, 68, 30, 44, 44, 59, 70, 109, 48, 71, 46, 58, 60, 118, 101, 50, 40, 60, 80, 75, 38, 56, 51, 56, 46, 41, 84, 99, 69, 54, 28, 28, 55, 123, 99, 99, 95, 50, 110, 70, 80, 70 )
colnames(pokemon) <- c("HitPoints", "Attack", "Defense", "SpecialAttack", "SpecialDefense", "Speed")
str(pokemon)
## num [1:800, 1:6] 45 60 80 80 39 58 78 78 78 44 ...
## - attr(*, "dimnames")=List of 2
## ..$ : NULL
## ..$ : chr [1:6] "HitPoints" "Attack" "Defense" "SpecialAttack" ...
apply(pokemon, 2, FUN=mean)
## HitPoints Attack Defense SpecialAttack SpecialDefense
## 69.25875 79.00125 73.84250 72.82000 71.90250
## Speed
## 68.27750
# Initialize total within sum of squares error: wss
wss <- 0
# Look over 1 to 15 possible clusters
for (i in 1:15) {
# Fit the model: km.out
km.out <- kmeans(pokemon, centers = i, nstart = 20, iter.max = 50)
# Save the within cluster sum of squares
wss[i] <- km.out$tot.withinss
}
# Produce a scree plot
plot(1:15, wss, type = "b",
xlab = "Number of Clusters",
ylab = "Within groups sum of squares")
# Select number of clusters
k <- 3
# Build model with k clusters: km.out
km.out <- kmeans(pokemon, centers = k, nstart = 20, iter.max = 50)
# View the resulting model
km.out
## K-means clustering with 3 clusters of sizes 355, 175, 270
##
## Cluster means:
## HitPoints Attack Defense SpecialAttack SpecialDefense Speed
## 1 54.68732 56.93239 53.64507 52.02254 53.04789 53.58873
## 2 79.30857 97.29714 108.93143 66.71429 87.04571 57.29143
## 3 81.90370 96.15926 77.65556 104.12222 86.87778 94.71111
##
## Clustering vector:
## [1] 1 1 3 3 1 1 3 3 3 1 1 2 3 1 1 1 1 1 1 3 1 1 3 3 1 1 1 3 1 3 1 3 1 2 1
## [36] 1 2 1 1 3 1 3 1 3 1 1 1 3 1 1 3 1 2 1 3 1 1 1 3 1 3 1 3 1 3 1 1 2 1 3
## [71] 3 3 1 2 2 1 1 3 1 3 1 2 2 1 3 1 2 2 1 3 1 1 3 1 2 1 2 1 2 1 3 3 3 2 1
## [106] 2 1 2 1 3 1 3 1 2 2 2 1 1 2 1 2 1 2 2 2 1 3 1 2 1 3 3 3 3 3 3 2 2 2 1
## [141] 2 2 2 1 1 3 3 3 1 1 2 1 2 3 3 2 3 3 3 1 1 3 3 3 3 3 1 1 2 1 1 3 1 1 2
## [176] 1 1 1 3 1 1 1 1 3 1 3 1 1 1 1 1 1 3 1 1 3 3 2 1 1 2 3 1 1 3 1 1 1 1 1
## [211] 2 3 2 1 2 3 1 1 3 1 2 1 2 2 2 1 2 1 2 2 2 2 2 1 1 2 1 2 1 2 1 1 3 1 3
## [246] 2 1 3 3 3 1 2 3 3 1 1 2 1 1 1 2 3 3 3 2 1 1 2 2 3 3 3 1 1 3 3 1 1 3 3
## [281] 1 1 2 2 1 1 1 1 1 1 1 1 1 1 1 3 1 1 3 1 1 1 2 1 1 3 3 1 1 1 2 1 1 3 1
## [316] 3 1 1 1 3 1 2 1 2 1 1 1 2 1 2 1 2 2 2 1 1 3 1 3 3 1 1 1 1 1 1 2 1 3 3
## [351] 1 3 1 3 2 2 1 3 1 1 1 3 1 3 1 2 3 3 3 3 2 1 2 1 2 1 2 1 2 1 2 1 3 1 2
## [386] 1 3 3 1 2 2 1 3 3 1 1 3 3 1 1 3 1 2 2 2 1 1 2 3 3 1 2 2 3 2 2 2 3 3 3
## [421] 3 3 3 2 3 3 3 3 3 3 2 3 1 2 2 1 1 3 1 1 3 1 1 3 1 1 1 1 1 1 3 1 3 1 2
## [456] 1 2 1 2 2 2 3 1 2 1 1 3 1 3 1 2 3 1 3 1 3 3 3 3 1 3 1 1 3 1 2 1 1 1 1
## [491] 2 1 1 3 3 1 1 3 3 1 2 1 2 1 3 2 1 3 1 1 3 2 3 3 2 2 2 3 3 3 3 2 3 2 3
## [526] 3 3 3 2 2 3 3 3 3 3 3 3 2 3 3 3 3 3 3 3 3 2 3 3 3 3 3 3 3 1 1 3 1 1 3
## [561] 1 1 3 1 1 1 1 2 1 3 1 3 1 3 1 3 1 2 1 1 3 1 3 1 2 2 1 3 1 3 2 2 1 2 2
## [596] 1 1 3 2 2 1 1 3 1 1 3 1 3 1 3 3 1 1 3 1 2 3 3 1 2 1 2 3 1 2 1 2 1 3 1
## [631] 2 1 3 1 3 1 1 2 1 1 3 1 3 1 1 3 1 3 3 1 2 1 2 1 3 2 1 3 1 2 1 2 2 1 1
## [666] 3 1 3 1 1 3 1 2 2 1 2 3 1 3 2 1 3 2 1 2 1 2 2 1 2 1 2 3 2 1 1 3 1 3 3
## [701] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 1 2 2 1 1 3 1 1 3 1 1 1 1 3 1 1 1
## [736] 1 3 1 1 3 1 3 1 2 3 1 3 3 1 2 3 2 1 2 1 3 1 2 1 2 1 2 1 3 1 3 1 2 1 3
## [771] 3 3 3 2 1 3 3 2 1 2 1 1 1 1 2 2 2 2 1 2 1 3 3 3 2 2 3 3 3 3
##
## Within cluster sum of squares by cluster:
## [1] 812079.9 709020.5 1018348.0
## (between_SS / total_SS = 40.8 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault"
# Plot of Defense vs. Speed by cluster membership
plot(pokemon[, c("Defense", "Speed")],
col = km.out$cluster,
main = paste("k-means clustering of Pokemon with", k, "clusters"),
xlab = "Defense", ylab = "Speed")
Chapter 2 - Hierarchical Clustering
Introduction to hierarchical clustering - creating clusters when the number of clusters is not known ahead of time:
Selecting the number of clusters - dendrograms (trees):
Clustering linkage and practical matters - how to determine distances between clusters:
Example code includes:
x <- matrix(data=NA, nrow=50, ncol=2)
x[, 1] <- c( 3.37, 1.44, 2.36, 2.63, 2.4, 1.89, 3.51, 1.91, 4.02, 1.94, 3.3, 4.29, 0.61, 1.72,
1.87, 2.64, 1.72, -0.66, -0.44, 3.32, 1.69, 0.22, 1.83, 3.21, 3.9, -5.43, -5.26,
-6.76, -4.54, -5.64, -4.54, -4.3, -3.96, -5.61, -4.5, -1.72, -0.78, -0.85, -2.41,
0.04, 0.21, -0.36, 0.76, -0.73, -1.37, 0.43, -0.81, 1.44, -0.43, 0.66
)
x[, 2] <- c( 2.32, 1.22, 3.58, 2.64, 2.09, 2.28, 2.68, 2.09, -0.99, 2.28, 1.63, 2.19, 2.58, 3.4,
1.27, 3.3, 2.34, 3.04, 2.92, 2.72, 0.96, 1.91, 2.62, 1.05, 1.46, 2.58, 2.77, 2.46,
1.11, 0.9, 3.51, 2.26, 2.09, 1.88, 0.81, -1.39, -2.22, -2.18, -1.07, -1.18, -0.61,
-2.48, -1.35, -0.61, -3.11, -2.86, -3.13, -3.46, -1.92, -1.35
)
str(x)
## num [1:50, 1:2] 3.37 1.44 2.36 2.63 2.4 1.89 3.51 1.91 4.02 1.94 ...
# Create hierarchical clustering model: hclust.out
hclust.out <- hclust(d=dist(x))
# Inspect the result
summary(hclust.out)
## Length Class Mode
## merge 98 -none- numeric
## height 49 -none- numeric
## order 50 -none- numeric
## labels 0 -none- NULL
## method 1 -none- character
## call 2 -none- call
## dist.method 1 -none- character
# Cut by height
cutree(hclust.out, h=7)
## [1] 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 3 3 3 3 3 3 3 3 3
## [36] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
# Cut by number of clusters
cutree(hclust.out, k=3)
## [1] 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 3 3 3 3 3 3 3 3 3
## [36] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
# Cluster using complete linkage: hclust.complete
hclust.complete <- hclust(dist(x), method="complete")
# Cluster using average linkage: hclust.average
hclust.average <- hclust(dist(x), method="average")
# Cluster using single linkage: hclust.single
hclust.single <- hclust(dist(x), method="single")
# Plot dendrogram of hclust.complete
plot(hclust.complete)
# Plot dendrogram of hclust.average
plot(hclust.average)
# Plot dendrogram of hclust.single
plot(hclust.single)
# View column means
colMeans(pokemon)
## HitPoints Attack Defense SpecialAttack SpecialDefense
## 69.25875 79.00125 73.84250 72.82000 71.90250
## Speed
## 68.27750
# View column standard deviations
apply(pokemon, 2, FUN=sd)
## HitPoints Attack Defense SpecialAttack SpecialDefense
## 25.53467 32.45737 31.18350 32.72229 27.82892
## Speed
## 29.06047
# Scale the data
pokemon.scaled <- scale(pokemon)
# Create hierarchical clustering model: hclust.pokemon
hclust.pokemon <- hclust(dist(pokemon.scaled), method="complete")
Chapter 3 - Dimensionality Reduction with PCA
Introduction to PCA - a popular type of dimensionality reduction to find structure in features, and aid in visualization:
Visualizing and intepreting PCA results:
Practical issues with PCA - scaling, missing values (drop and/or impute), categorical data (drop or encode as numbers):
Example code includes:
pokemon <- matrix(nrow=50, ncol=4, byrow=FALSE,
data=c( 58, 90, 70, 60, 60, 44, 100, 80, 80, 60, 150, 62, 75, 70, 115, 74, 74,
40, 95, 80, 25, 51, 48, 45, 35, 20, 60, 70, 70, 80, 57, 64, 75, 101, 50,
60, 85, 95, 58, 100, 95, 91, 62, 70, 60, 70, 50, 50, 70, 150, 64, 100,
94, 80, 55, 38, 77, 145, 100, 55, 100, 77, 98, 130, 45, 108, 94, 35, 65,
120, 35, 65, 72, 45, 55, 40, 70, 20, 55, 100, 24, 78, 98, 72, 75, 100, 120,
155, 89, 150, 125, 90, 48, 40, 110, 85, 85, 50, 110, 120, 58, 70, 50, 110,
90, 33, 77, 150, 70, 145, 120, 62, 63, 100, 20, 133, 131, 30, 65, 130, 70,
65, 48, 55, 40, 90, 50, 50, 65, 80, 86, 52, 63, 72, 70, 89, 70, 109, 77,
120, 79, 129, 54, 50, 70, 140, 40, 62, 70, 100, 80, 80, 66, 45, 80, 70,
90, 110, 95, 40, 90, 65, 101, 65, 20, 32, 20, 105, 60, 45, 45, 59, 48, 63,
60, 25, 65, 40, 70, 100, 23, 81, 101, 29, 48, 112, 100, 81, 48, 90, 81,
108, 68, 25, 100, 20, 35, 65, 90, 90
)
)
colnames(pokemon) <- c( "HitPoint", "Attack", "Defense", "Speed" )
rownames(pokemon) <- c( 'Quilava', 'Goodra', 'Mothim', 'Marowak', 'Chandelure', 'Helioptile',
'MeloettaAria Forme', 'MetagrossMega Metagross', 'Sawsbuck', 'Probopass',
'GiratinaAltered Forme', 'Tranquill', 'Simisage', 'Scizor', 'Jigglypuff',
'Carracosta', 'Ferrothorn', 'Kadabra', 'Sylveon', 'Golem', 'Magnemite',
'Vanillish', 'Unown', 'Snivy', 'Tynamo', 'Duskull', 'Beautifly', 'Marill',
'Lunatone', 'Flygon', 'Bronzor', 'Monferno', 'Simisear', 'Aromatisse',
'Scraggy', 'Scolipede', 'Staraptor', 'GyaradosMega Gyarados', 'Tyrunt', 'Zekrom',
'Gyarados', 'Cobalion', 'Espurr', 'Spheal', 'Dodrio', 'Torkoal', 'Cacnea',
'Trubbish', 'Lucario', 'GiratinaOrigin Forme'
)
str(pokemon)
## num [1:50, 1:4] 58 90 70 60 60 44 100 80 80 60 ...
## - attr(*, "dimnames")=List of 2
## ..$ : chr [1:50] "Quilava" "Goodra" "Mothim" "Marowak" ...
## ..$ : chr [1:4] "HitPoint" "Attack" "Defense" "Speed"
colMeans(pokemon)
## HitPoint Attack Defense Speed
## 71.08 81.22 78.44 66.58
head(pokemon)
## HitPoint Attack Defense Speed
## Quilava 58 64 58 80
## Goodra 90 100 70 80
## Mothim 70 94 50 66
## Marowak 60 80 110 45
## Chandelure 60 55 90 80
## Helioptile 44 38 33 70
# Perform scaled PCA: pr.out
pr.out <- prcomp(pokemon, scale=TRUE)
# Inspect model output
summary(pr.out)
## Importance of components:
## PC1 PC2 PC3 PC4
## Standard deviation 1.4420 1.0013 0.7941 0.53595
## Proportion of Variance 0.5199 0.2507 0.1577 0.07181
## Cumulative Proportion 0.5199 0.7705 0.9282 1.00000
biplot(pr.out)
# Variability of each principal component: pr.var
pr.var <- (pr.out$sdev)^2
# Variance explained by each principal component: pve
pve <- pr.var / sum(pr.var)
# Plot variance explained for each principal component
plot(pve, xlab = "Principal Component",
ylab = "Proportion of Variance Explained",
ylim = c(0, 1), type = "b")
# Plot cumulative proportion of variance explained
plot(cumsum(pve), xlab = "Principal Component",
ylab = "Cummulative Proportion of Variance Explained",
ylim = c(0, 1), type = "b")
pokeTotal <- matrix(ncol=1, nrow=50,
data=c( 405, 600, 424, 425, 520, 289, 600, 700, 475, 525, 680, 358, 498, 500,
270, 495, 489, 400, 525, 495, 325, 395, 336, 308, 275, 295, 395, 250,
440, 520, 300, 405, 498, 462, 348, 485, 485, 640, 362, 680, 540, 580,
355, 290, 460, 470, 335, 329, 525, 680
)
)
pokemon <- cbind(pokeTotal, pokemon)
colnames(pokemon)[1] <- "Total"
str(pokemon)
## num [1:50, 1:5] 405 600 424 425 520 289 600 700 475 525 ...
## - attr(*, "dimnames")=List of 2
## ..$ : chr [1:50] "Quilava" "Goodra" "Mothim" "Marowak" ...
## ..$ : chr [1:5] "Total" "HitPoint" "Attack" "Defense" ...
colMeans(pokemon)
## Total HitPoint Attack Defense Speed
## 448.82 71.08 81.22 78.44 66.58
# Mean of each variable
colMeans(pokemon)
## Total HitPoint Attack Defense Speed
## 448.82 71.08 81.22 78.44 66.58
# Standard deviation of each variable
apply(pokemon, 2, sd)
## Total HitPoint Attack Defense Speed
## 119.32321 25.62193 33.03078 32.05809 27.51036
# PCA model with scaling: pr.with.scaling
pr.with.scaling <- prcomp(pokemon, scale=TRUE)
# PCA model without scaling: pr.without.scaling
pr.without.scaling <- prcomp(pokemon, scale=FALSE)
# Create biplots of both for comparison
biplot(pr.with.scaling)
biplot(pr.without.scaling)
Chapter 4 - Case Study
Introduction to the case study:
PCA Review and Next Steps:
Example code includes:
# Cached to avoid repeated downloads
url <- "http://s3.amazonaws.com/assets.datacamp.com/production/course_1903/datasets/WisconsinCancer.csv"
# Download the data: wisc.df
wisc.df <- read.csv(url, stringsAsFactors=FALSE)
# Convert the features of the data: wisc.data
wisc.data <- as.matrix(wisc.df[, 3:32])
# Set the row names of wisc.data
row.names(wisc.data) <- wisc.df$id
# Create diagnosis vector
diagnosis <- as.numeric(wisc.df$diagnosis == "M")
And, continuing with:
# Check column means and standard deviations
colMeans(wisc.data)
## radius_mean texture_mean perimeter_mean
## 1.412729e+01 1.928965e+01 9.196903e+01
## area_mean smoothness_mean compactness_mean
## 6.548891e+02 9.636028e-02 1.043410e-01
## concavity_mean concave.points_mean symmetry_mean
## 8.879932e-02 4.891915e-02 1.811619e-01
## fractal_dimension_mean radius_se texture_se
## 6.279761e-02 4.051721e-01 1.216853e+00
## perimeter_se area_se smoothness_se
## 2.866059e+00 4.033708e+01 7.040979e-03
## compactness_se concavity_se concave.points_se
## 2.547814e-02 3.189372e-02 1.179614e-02
## symmetry_se fractal_dimension_se radius_worst
## 2.054230e-02 3.794904e-03 1.626919e+01
## texture_worst perimeter_worst area_worst
## 2.567722e+01 1.072612e+02 8.805831e+02
## smoothness_worst compactness_worst concavity_worst
## 1.323686e-01 2.542650e-01 2.721885e-01
## concave.points_worst symmetry_worst fractal_dimension_worst
## 1.146062e-01 2.900756e-01 8.394582e-02
apply(wisc.data, 2, FUN=sd)
## radius_mean texture_mean perimeter_mean
## 3.524049e+00 4.301036e+00 2.429898e+01
## area_mean smoothness_mean compactness_mean
## 3.519141e+02 1.406413e-02 5.281276e-02
## concavity_mean concave.points_mean symmetry_mean
## 7.971981e-02 3.880284e-02 2.741428e-02
## fractal_dimension_mean radius_se texture_se
## 7.060363e-03 2.773127e-01 5.516484e-01
## perimeter_se area_se smoothness_se
## 2.021855e+00 4.549101e+01 3.002518e-03
## compactness_se concavity_se concave.points_se
## 1.790818e-02 3.018606e-02 6.170285e-03
## symmetry_se fractal_dimension_se radius_worst
## 8.266372e-03 2.646071e-03 4.833242e+00
## texture_worst perimeter_worst area_worst
## 6.146258e+00 3.360254e+01 5.693570e+02
## smoothness_worst compactness_worst concavity_worst
## 2.283243e-02 1.573365e-01 2.086243e-01
## concave.points_worst symmetry_worst fractal_dimension_worst
## 6.573234e-02 6.186747e-02 1.806127e-02
# Execute PCA, scaling if appropriate: wisc.pr
wisc.pr <- prcomp(wisc.data, scale=TRUE)
# Look at summary of results
summary(wisc.pr)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 3.6444 2.3857 1.67867 1.40735 1.28403 1.09880
## Proportion of Variance 0.4427 0.1897 0.09393 0.06602 0.05496 0.04025
## Cumulative Proportion 0.4427 0.6324 0.72636 0.79239 0.84734 0.88759
## PC7 PC8 PC9 PC10 PC11 PC12
## Standard deviation 0.82172 0.69037 0.6457 0.59219 0.5421 0.51104
## Proportion of Variance 0.02251 0.01589 0.0139 0.01169 0.0098 0.00871
## Cumulative Proportion 0.91010 0.92598 0.9399 0.95157 0.9614 0.97007
## PC13 PC14 PC15 PC16 PC17 PC18
## Standard deviation 0.49128 0.39624 0.30681 0.28260 0.24372 0.22939
## Proportion of Variance 0.00805 0.00523 0.00314 0.00266 0.00198 0.00175
## Cumulative Proportion 0.97812 0.98335 0.98649 0.98915 0.99113 0.99288
## PC19 PC20 PC21 PC22 PC23 PC24
## Standard deviation 0.22244 0.17652 0.1731 0.16565 0.15602 0.1344
## Proportion of Variance 0.00165 0.00104 0.0010 0.00091 0.00081 0.0006
## Cumulative Proportion 0.99453 0.99557 0.9966 0.99749 0.99830 0.9989
## PC25 PC26 PC27 PC28 PC29 PC30
## Standard deviation 0.12442 0.09043 0.08307 0.03987 0.02736 0.01153
## Proportion of Variance 0.00052 0.00027 0.00023 0.00005 0.00002 0.00000
## Cumulative Proportion 0.99942 0.99969 0.99992 0.99997 1.00000 1.00000
# Create a biplot of wisc.pr
biplot(wisc.pr)
# Scatter plot observations by components 1 and 2
plot(wisc.pr$x[, c(1, 2)], col = (diagnosis + 1),
xlab = "PC1", ylab = "PC2")
# Repeat for components 1 and 3
plot(wisc.pr$x[, c(1, 3)], col = (diagnosis + 1),
xlab = "PC1", ylab = "PC3")
par(mfrow = c(1, 2))
# Calculate variability of each component
pr.var <- (wisc.pr$sdev)^2
# Variance explained by each principal component: pve
pve <- pr.var / sum(pr.var)
# Plot variance explained for each principal component
plot(pve, xlab = "Principal Component",
ylab = "Proportion of Variance Explained",
ylim = c(0, 1), type = "b")
# Plot cumulative proportion of variance explained
plot(cumsum(pve), xlab = "Principal Component",
ylab = "Cummulative Proportion of Variance Explained",
ylim = c(0, 1), type = "b")
par(mfrow = c(1, 1))
# Scale the wisc.data data: data.scaled
data.scaled <- scale(wisc.data)
# Calculate the (Euclidean) distances: data.dist
data.dist <- dist(data.scaled)
# Create a hierarchical clustering model: wisc.hclust
wisc.hclust <- hclust(data.dist, method="complete")
# Cut tree so that it has 4 clusters: wisc.hclust.clusters
wisc.hclust.clusters <- cutree(wisc.hclust, k=4)
# Compare cluster membership to actual diagnoses
table(wisc.hclust.clusters, diagnosis)
## diagnosis
## wisc.hclust.clusters 0 1
## 1 12 165
## 2 2 5
## 3 343 40
## 4 0 2
# Create a k-means model on wisc.data: wisc.km
wisc.km <- kmeans(scale(wisc.data), centers=2, nstart=20)
# Compare k-means to actual diagnoses
table(wisc.km$cluster, diagnosis)
## diagnosis
## 0 1
## 1 14 175
## 2 343 37
# Compare k-means to hierarchical clustering
table(wisc.km$cluster, wisc.hclust.clusters)
## wisc.hclust.clusters
## 1 2 3 4
## 1 160 7 20 2
## 2 17 0 363 0
# Create a hierarchical clustering model: wisc.pr.hclust
wisc.pr.hclust <- hclust(dist(wisc.pr$x[, 1:7]), method = "complete")
# Cut model into 4 clusters: wisc.pr.hclust.clusters
wisc.pr.hclust.clusters <- cutree(wisc.pr.hclust, k=4)
# Compare to actual diagnoses
table(wisc.pr.hclust.clusters, diagnosis)
## diagnosis
## wisc.pr.hclust.clusters 0 1
## 1 5 113
## 2 350 97
## 3 2 0
## 4 0 2
# Compare to k-means and hierarchical
table(wisc.km$cluster, diagnosis)
## diagnosis
## 0 1
## 1 14 175
## 2 343 37
table(wisc.hclust.clusters, diagnosis)
## diagnosis
## wisc.hclust.clusters 0 1
## 1 12 165
## 2 2 5
## 3 343 40
## 4 0 2
Chapter 1 - Regression Models: Fitting and Training
Max Kuhn, author of the caret package for supervised learning:
Out-of-sample error measurement - Zach Mayer, co-author of the caret package:
Cross-validation - improved approach of taking multiple test/train and averaging out-of-sample error rates:
Example code includes:
data(diamonds, package="ggplot2")
str(diamonds)
## Classes 'tbl_df', 'tbl' and 'data.frame': 53940 obs. of 10 variables:
## $ carat : num 0.23 0.21 0.23 0.29 0.31 0.24 0.24 0.26 0.22 0.23 ...
## $ cut : Ord.factor w/ 5 levels "Fair"<"Good"<..: 5 4 2 4 2 3 3 3 1 3 ...
## $ color : Ord.factor w/ 7 levels "D"<"E"<"F"<"G"<..: 2 2 2 6 7 7 6 5 2 5 ...
## $ clarity: Ord.factor w/ 8 levels "I1"<"SI2"<"SI1"<..: 2 3 5 4 2 6 7 3 4 5 ...
## $ depth : num 61.5 59.8 56.9 62.4 63.3 62.8 62.3 61.9 65.1 59.4 ...
## $ table : num 55 61 65 58 58 57 57 55 61 61 ...
## $ price : int 326 326 327 334 335 336 336 337 337 338 ...
## $ x : num 3.95 3.89 4.05 4.2 4.34 3.94 3.95 4.07 3.87 4 ...
## $ y : num 3.98 3.84 4.07 4.23 4.35 3.96 3.98 4.11 3.78 4.05 ...
## $ z : num 2.43 2.31 2.31 2.63 2.75 2.48 2.47 2.53 2.49 2.39 ...
# Fit lm model: model
model <- lm(price ~ ., data=diamonds)
# Predict on full data: p
p <- predict(model)
# Compute errors: error
error <- p - diamonds$price
# Calculate RMSE
sqrt(mean(error^2))
## [1] 1129.843
# Shuffle row indices: rows
rows <- sample(nrow(diamonds), replace=FALSE)
# Randomly order data
diamonds <- diamonds[rows, ]
# Determine row to split on: split
split <- round(nrow(diamonds) * 0.8)
# Create train
train <- diamonds[1:split, ]
# Create test
test <- diamonds[-(1:split), ]
# Fit lm model on train: model
model <- lm(price ~ ., data=train)
# Predict on test: p
p <- predict(model, newdata=test)
# Compute errors: error
error <- p - test$price
# Calculate RMSE
sqrt(mean(error^2))
## [1] 1119.8
# Fit lm model using 10-fold CV: model
model <- caret::train(
price ~ ., data=diamonds,
method = "lm",
trControl = caret::trainControl(
method = "cv", number = 10,
verboseIter = TRUE
)
)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
## + Fold01: intercept=TRUE
## - Fold01: intercept=TRUE
## + Fold02: intercept=TRUE
## - Fold02: intercept=TRUE
## + Fold03: intercept=TRUE
## - Fold03: intercept=TRUE
## + Fold04: intercept=TRUE
## - Fold04: intercept=TRUE
## + Fold05: intercept=TRUE
## - Fold05: intercept=TRUE
## + Fold06: intercept=TRUE
## - Fold06: intercept=TRUE
## + Fold07: intercept=TRUE
## - Fold07: intercept=TRUE
## + Fold08: intercept=TRUE
## - Fold08: intercept=TRUE
## + Fold09: intercept=TRUE
## - Fold09: intercept=TRUE
## + Fold10: intercept=TRUE
## - Fold10: intercept=TRUE
## Aggregating results
## Fitting final model on full training set
# Print model to console
model
## Linear Regression
##
## 53940 samples
## 9 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 48546, 48545, 48545, 48545, 48546, 48546, ...
## Resampling results:
##
## RMSE Rsquared
## 1130.963 0.919703
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
data(BostonHousing, package="mlbench")
Boston <- BostonHousing
str(Boston)
## 'data.frame': 506 obs. of 14 variables:
## $ crim : num 0.00632 0.02731 0.02729 0.03237 0.06905 ...
## $ zn : num 18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
## $ indus : num 2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
## $ chas : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ nox : num 0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
## $ rm : num 6.58 6.42 7.18 7 7.15 ...
## $ age : num 65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
## $ dis : num 4.09 4.97 4.97 6.06 6.06 ...
## $ rad : num 1 2 2 3 3 3 5 5 5 5 ...
## $ tax : num 296 242 242 222 222 222 311 311 311 311 ...
## $ ptratio: num 15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
## $ b : num 397 397 393 395 397 ...
## $ lstat : num 4.98 9.14 4.03 2.94 5.33 ...
## $ medv : num 24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
# Fit lm model using 5-fold CV: model
model <- caret::train(
medv ~ ., data=Boston,
method = "lm",
trControl = caret::trainControl(
method = "cv", number = 5,
verboseIter = TRUE
)
)
## + Fold1: intercept=TRUE
## - Fold1: intercept=TRUE
## + Fold2: intercept=TRUE
## - Fold2: intercept=TRUE
## + Fold3: intercept=TRUE
## - Fold3: intercept=TRUE
## + Fold4: intercept=TRUE
## - Fold4: intercept=TRUE
## + Fold5: intercept=TRUE
## - Fold5: intercept=TRUE
## Aggregating results
## Fitting final model on full training set
# Print model to console
model
## Linear Regression
##
## 506 samples
## 13 predictor
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 405, 407, 404, 404, 404
## Resampling results:
##
## RMSE Rsquared
## 4.904787 0.7223226
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
# Fit lm model using 5 x 5-fold CV: model
model <- train(
medv ~ ., Boston,
method = "lm",
trControl = trainControl(
method = "cv", number = 5,
repeats = 5, verboseIter = TRUE
)
)
## + Fold1: intercept=TRUE
## - Fold1: intercept=TRUE
## + Fold2: intercept=TRUE
## - Fold2: intercept=TRUE
## + Fold3: intercept=TRUE
## - Fold3: intercept=TRUE
## + Fold4: intercept=TRUE
## - Fold4: intercept=TRUE
## + Fold5: intercept=TRUE
## - Fold5: intercept=TRUE
## Aggregating results
## Fitting final model on full training set
# Print model to console
model
## Linear Regression
##
## 506 samples
## 13 predictor
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 405, 404, 406, 404, 405
## Resampling results:
##
## RMSE Rsquared
## 4.885857 0.7205083
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
# Predict on full Boston dataset
predict(model, newdata=Boston)
## 1 2 3 4 5 6
## 30.0038434 25.0255624 30.5675967 28.6070365 27.9435242 25.2562845
## 7 8 9 10 11 12
## 23.0018083 19.5359884 11.5236369 18.9202621 18.9994965 21.5867957
## 13 14 15 16 17 18
## 20.9065215 19.5529028 19.2834821 19.2974832 20.5275098 16.9114013
## 19 20 21 22 23 24
## 16.1780111 18.4061360 12.5238575 17.6710367 15.8328813 13.8062853
## 25 26 27 28 29 30
## 15.6783383 13.3866856 15.4639765 14.7084743 19.5473729 20.8764282
## 31 32 33 34 35 36
## 11.4551176 18.0592329 8.8110574 14.2827581 13.7067589 23.8146353
## 37 38 39 40 41 42
## 22.3419371 23.1089114 22.9150261 31.3576257 34.2151023 28.0205641
## 43 44 45 46 47 48
## 25.2038663 24.6097927 22.9414918 22.0966982 20.4232003 18.0365509
## 49 50 51 52 53 54
## 9.1065538 17.2060775 21.2815254 23.9722228 27.6558508 24.0490181
## 55 56 57 58 59 60
## 15.3618477 31.1526495 24.8568698 33.1091981 21.7753799 21.0849356
## 61 62 63 64 65 66
## 17.8725804 18.5111021 23.9874286 22.5540887 23.3730864 30.3614836
## 67 68 69 70 71 72
## 25.5305651 21.1133856 17.4215379 20.7848363 25.2014886 21.7426577
## 73 74 75 76 77 78
## 24.5574496 24.0429571 25.5049972 23.9669302 22.9454540 23.3569982
## 79 80 81 82 83 84
## 21.2619827 22.4281737 28.4057697 26.9948609 26.0357630 25.0587348
## 85 86 87 88 89 90
## 24.7845667 27.7904920 22.1685342 25.8927642 30.6746183 30.8311062
## 91 92 93 94 95 96
## 27.1190194 27.4126673 28.9412276 29.0810555 27.0397736 28.6245995
## 97 98 99 100 101 102
## 24.7274498 35.7815952 35.1145459 32.2510280 24.5802202 25.5941347
## 103 104 105 106 107 108
## 19.7901368 20.3116713 21.4348259 18.5399401 17.1875599 20.7504903
## 109 110 111 112 113 114
## 22.6482911 19.7720367 20.6496586 26.5258674 20.7732364 20.7154831
## 115 116 117 118 119 120
## 25.1720888 20.4302559 23.3772463 23.6904326 20.3357836 20.7918087
## 121 122 123 124 125 126
## 21.9163207 22.4710778 20.5573856 16.3666198 20.5609982 22.4817845
## 127 128 129 130 131 132
## 14.6170663 15.1787668 18.9386859 14.0557329 20.0352740 19.4101340
## 133 134 135 136 137 138
## 20.0619157 15.7580767 13.2564524 17.2627773 15.8784188 19.3616395
## 139 140 141 142 143 144
## 13.8148390 16.4488147 13.5714193 3.9888551 14.5949548 12.1488148
## 145 146 147 148 149 150
## 8.7282236 12.0358534 15.8208206 8.5149902 9.7184414 14.8045137
## 151 152 153 154 155 156
## 20.8385815 18.3010117 20.1228256 17.2860189 22.3660023 20.1037592
## 157 158 159 160 161 162
## 13.6212589 33.2598270 29.0301727 25.5675277 32.7082767 36.7746701
## 163 164 165 166 167 168
## 40.5576584 41.8472817 24.7886738 25.3788924 37.2034745 23.0874875
## 169 170 171 172 173 174
## 26.4027396 26.6538211 22.5551466 24.2908281 22.9765722 29.0719431
## 175 176 177 178 179 180
## 26.5219434 30.7220906 25.6166931 29.1374098 31.4357197 32.9223157
## 181 182 183 184 185 186
## 34.7244046 27.7655211 33.8878732 30.9923804 22.7182001 24.7664781
## 187 188 189 190 191 192
## 35.8849723 33.4247672 32.4119915 34.5150995 30.7610949 30.2893414
## 193 194 195 196 197 198
## 32.9191871 32.1126077 31.5587100 40.8455572 36.1277008 32.6692081
## 199 200 201 202 203 204
## 34.7046912 30.0934516 30.6439391 29.2871950 37.0714839 42.0319312
## 205 206 207 208 209 210
## 43.1894984 22.6903480 23.6828471 17.8544721 23.4942899 17.0058772
## 211 212 213 214 215 216
## 22.3925110 17.0604275 22.7389292 25.2194255 11.1191674 24.5104915
## 217 218 219 220 221 222
## 26.6033477 28.3551871 24.9152546 29.6865277 33.1841975 23.7745666
## 223 224 225 226 227 228
## 32.1405196 29.7458199 38.3710245 39.8146187 37.5860575 32.3995325
## 229 230 231 232 233 234
## 35.4566524 31.2341151 24.4844923 33.2883729 38.0481048 37.1632863
## 235 236 237 238 239 240
## 31.7138352 25.2670557 30.1001074 32.7198716 28.4271706 28.4294068
## 241 242 243 244 245 246
## 27.2937594 23.7426248 24.1200789 27.4020841 16.3285756 13.3989126
## 247 248 249 250 251 252
## 20.0163878 19.8618443 21.2883131 24.0798915 24.2063355 25.0421582
## 253 254 255 256 257 258
## 24.9196401 29.9456337 23.9722832 21.6958089 37.5110924 43.3023904
## 259 260 261 262 263 264
## 36.4836142 34.9898859 34.8121151 37.1663133 40.9892850 34.4463409
## 265 266 267 268 269 270
## 35.8339755 28.2457430 31.2267359 40.8395575 39.3179239 25.7081791
## 271 272 273 274 275 276
## 22.3029553 27.2034097 28.5116947 35.4767660 36.1063916 33.7966827
## 277 278 279 280 281 282
## 35.6108586 34.8399338 30.3519266 35.3098070 38.7975697 34.3312319
## 283 284 285 286 287 288
## 40.3396307 44.6730834 31.5968909 27.3565923 20.1017415 27.0420667
## 289 290 291 292 293 294
## 27.2136458 26.9139584 33.4356331 34.4034963 31.8333982 25.8178324
## 295 296 297 298 299 300
## 24.4298235 28.4576434 27.3626700 19.5392876 29.1130984 31.9105461
## 301 302 303 304 305 306
## 30.7715945 28.9427587 28.8819102 32.7988723 33.2090546 30.7683179
## 307 308 309 310 311 312
## 35.5622686 32.7090512 28.6424424 23.5896583 18.5426690 26.8788984
## 313 314 315 316 317 318
## 23.2813398 25.5458025 25.4812006 20.5390990 17.6157257 18.3758169
## 319 320 321 322 323 324
## 24.2907028 21.3252904 24.8868224 24.8693728 22.8695245 19.4512379
## 325 326 327 328 329 330
## 25.1178340 24.6678691 23.6807618 19.3408962 21.1741811 24.2524907
## 331 332 333 334 335 336
## 21.5926089 19.9844661 23.3388800 22.1406069 21.5550993 20.6187291
## 337 338 339 340 341 342
## 20.1609718 19.2849039 22.1667232 21.2496577 21.4293931 30.3278880
## 343 344 345 346 347 348
## 22.0473498 27.7064791 28.5479412 16.5450112 14.7835964 25.2738008
## 349 350 351 352 353 354
## 27.5420512 22.1483756 20.4594409 20.5460542 16.8806383 25.4025351
## 355 356 357 358 359 360
## 14.3248663 16.5948846 19.6370469 22.7180661 22.2021889 19.2054806
## 361 362 363 364 365 366
## 22.6661611 18.9319262 18.2284680 20.2315081 37.4944739 14.2819073
## 367 368 369 370 371 372
## 15.5428625 10.8316232 23.8007290 32.6440736 34.6068404 24.9433133
## 373 374 375 376 377 378
## 25.9998091 6.1263250 0.7777981 25.3071306 17.7406106 20.2327441
## 379 380 381 382 383 384
## 15.8333130 16.8351259 14.3699483 18.4768283 13.4276828 13.0617751
## 385 386 387 388 389 390
## 3.2791812 8.0602217 6.1284220 5.6186481 6.4519857 14.2076474
## 391 392 393 394 395 396
## 17.2122518 17.2988727 9.8911664 20.2212419 17.9418118 20.3044578
## 397 398 399 400 401 402
## 19.2955908 16.3363278 6.5516232 10.8901678 11.8814587 17.8117451
## 403 404 405 406 407 408
## 18.2612659 12.9794878 7.3781636 8.2111586 8.0662619 19.9829479
## 409 410 411 412 413 414
## 13.7075637 19.8526845 15.2230830 16.9607198 1.7185181 11.8057839
## 415 416 417 418 419 420
## -4.2813107 9.5837674 13.3666081 6.8956236 6.1477985 14.6066179
## 421 422 423 424 425 426
## 19.6000267 18.1242748 18.5217713 13.1752861 14.6261762 9.9237498
## 427 428 429 430 431 432
## 16.3459065 14.0751943 14.2575624 13.0423479 18.1595569 18.6955435
## 433 434 435 436 437 438
## 21.5272830 17.0314186 15.9609044 13.3614161 14.5207938 8.8197601
## 439 440 441 442 443 444
## 4.8675110 13.0659131 12.7060970 17.2955806 18.7404850 18.0590103
## 445 446 447 448 449 450
## 11.5147468 11.9740036 17.6834462 18.1269524 17.5183465 17.2274251
## 451 452 453 454 455 456
## 16.5227163 19.4129110 18.5821524 22.4894479 15.2800013 15.8208934
## 457 458 459 460 461 462
## 12.6872558 12.8763379 17.1866853 18.5124761 19.0486053 20.1720893
## 463 464 465 466 467 468
## 19.7740732 22.4294077 20.3191185 17.8861625 14.3747852 16.9477685
## 469 470 471 472 473 474
## 16.9840576 18.5883840 20.1671944 22.9771803 22.4558073 25.5782463
## 475 476 477 478 479 480
## 16.3914763 16.1114628 20.5348160 11.5427274 19.2049630 21.8627639
## 481 482 483 484 485 486
## 23.4687887 27.0988732 28.5699430 21.0839878 19.4551620 22.2222591
## 487 488 489 490 491 492
## 19.6559196 21.3253610 11.8558372 8.2238669 3.6639967 13.7590854
## 493 494 495 496 497 498
## 15.9311855 20.6266205 20.6124941 16.8854196 14.0132079 19.1085414
## 499 500 501 502 503 504
## 21.2980517 18.4549884 20.4687085 23.5333405 22.3757189 27.6274261
## 505 506
## 26.1279668 22.3442123
Chapter 2 - Classification Models
Logistic regression on mlbench::Sonar - classification models for categorical outcomes:
Confusion matrix - predicted outcomes vs. actual reality:
Class probabilities and class predictions - can modify thresholds for declaring positive depending on desired specificity vs. sensitivity:
Receive Operator Criteria - looking at many confusion matrices is time-consuming and non-scientific/systematic:
Area Under the Curve (AUC) - models that are more random will closely follow the diagonal line, while perfect models hit the upper-left corner:
Example code includes:
data(Sonar, package="mlbench")
# Shuffle row indices: rows
rows <- sample(nrow(Sonar), replace=FALSE)
# Randomly order data: Sonar
Sonar <- Sonar[rows, ]
# Identify row to split on: split
split <- round(nrow(Sonar) * 0.6)
# Create train
train <- Sonar[1:split, ]
# Create test
test <- Sonar[-(1:split), ]
# Fit glm model: model
model <- glm(Class ~ ., family="binomial", data=train)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Predict on test: p
p <- predict(model, newdata=test, type="response")
# Calculate class probabilities: p_class
p_class <- ifelse(p > 0.5, "R", "M")
# Create confusion matrix
caret::confusionMatrix(p_class, test$Class)
## Confusion Matrix and Statistics
##
## Reference
## Prediction M R
## M 26 10
## R 18 29
##
## Accuracy : 0.6627
## 95% CI : (0.5505, 0.7628)
## No Information Rate : 0.5301
## P-Value [Acc > NIR] : 0.009914
##
## Kappa : 0.3306
## Mcnemar's Test P-Value : 0.185877
##
## Sensitivity : 0.5909
## Specificity : 0.7436
## Pos Pred Value : 0.7222
## Neg Pred Value : 0.6170
## Prevalence : 0.5301
## Detection Rate : 0.3133
## Detection Prevalence : 0.4337
## Balanced Accuracy : 0.6672
##
## 'Positive' Class : M
##
# Apply threshold of 0.9: p_class
p_class <- ifelse(p > 0.9, "R", "M")
# Create confusion matrix
caret::confusionMatrix(p_class, test$Class)
## Confusion Matrix and Statistics
##
## Reference
## Prediction M R
## M 29 10
## R 15 29
##
## Accuracy : 0.6988
## 95% CI : (0.5882, 0.7947)
## No Information Rate : 0.5301
## P-Value [Acc > NIR] : 0.001298
##
## Kappa : 0.3998
## Mcnemar's Test P-Value : 0.423711
##
## Sensitivity : 0.6591
## Specificity : 0.7436
## Pos Pred Value : 0.7436
## Neg Pred Value : 0.6591
## Prevalence : 0.5301
## Detection Rate : 0.3494
## Detection Prevalence : 0.4699
## Balanced Accuracy : 0.7013
##
## 'Positive' Class : M
##
# Apply threshold of 0.10: p_class
p_class <- ifelse(p > 0.1, "R", "M")
# Create confusion matrix
confusionMatrix(p_class, test$Class)
## Confusion Matrix and Statistics
##
## Reference
## Prediction M R
## M 26 10
## R 18 29
##
## Accuracy : 0.6627
## 95% CI : (0.5505, 0.7628)
## No Information Rate : 0.5301
## P-Value [Acc > NIR] : 0.009914
##
## Kappa : 0.3306
## Mcnemar's Test P-Value : 0.185877
##
## Sensitivity : 0.5909
## Specificity : 0.7436
## Pos Pred Value : 0.7222
## Neg Pred Value : 0.6170
## Prevalence : 0.5301
## Detection Rate : 0.3133
## Detection Prevalence : 0.4337
## Balanced Accuracy : 0.6672
##
## 'Positive' Class : M
##
# Predict on test: p
p <- predict(model, newdata=test, type="response")
# Make ROC curve
caTools::colAUC(p, test$Class, plotROC=TRUE)
## [,1]
## M vs. R 0.7246503
# Create trainControl object: myControl
myControl <- caret::trainControl(
method = "cv",
number = 10,
summaryFunction = twoClassSummary,
classProbs = TRUE, # IMPORTANT!
verboseIter = TRUE
)
# Train glm with custom trainControl: model
model <- caret::train(Class ~ ., data=Sonar, method="glm", trControl=myControl)
## Warning in train.default(x, y, weights = w, ...): The metric "Accuracy" was
## not in the result set. ROC will be used instead.
## + Fold01: parameter=none
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## - Fold01: parameter=none
## + Fold02: parameter=none
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## - Fold02: parameter=none
## + Fold03: parameter=none
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## - Fold03: parameter=none
## + Fold04: parameter=none
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## - Fold04: parameter=none
## + Fold05: parameter=none
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## - Fold05: parameter=none
## + Fold06: parameter=none
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## - Fold06: parameter=none
## + Fold07: parameter=none
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## - Fold07: parameter=none
## + Fold08: parameter=none
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## - Fold08: parameter=none
## + Fold09: parameter=none
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## - Fold09: parameter=none
## + Fold10: parameter=none
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## - Fold10: parameter=none
## Aggregating results
## Fitting final model on full training set
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Print model to console
model
## Generalized Linear Model
##
## 208 samples
## 60 predictor
## 2 classes: 'M', 'R'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 187, 187, 187, 186, 188, 187, ...
## Resampling results:
##
## ROC Sens Spec
## 0.7387879 0.7310606 0.7122222
Chapter 3 - Tuning Model Parameters
Random forests and wine - very robust against over-fitting, and frequently yield very accurate non-linear models:
Explore a wider model space - random forests require tuning (hyper-parameters):
Custom tuning grids - further customization of the tuneGrid data frame (most flexible, complete control of grid-search exploration):
Introducing glmnet - extension of generalized linear model (glm) with built-in variable selection:
Custom tuning grids with glmnet - ability to tune on both alpha and lambda:
Example code includes:
redWine <- read.csv("redWine.csv", sep=";")
str(redWine)
## 'data.frame': 1599 obs. of 12 variables:
## $ fixed.acidity : num 7.4 7.8 7.8 11.2 7.4 7.4 7.9 7.3 7.8 7.5 ...
## $ volatile.acidity : num 0.7 0.88 0.76 0.28 0.7 0.66 0.6 0.65 0.58 0.5 ...
## $ citric.acid : num 0 0 0.04 0.56 0 0 0.06 0 0.02 0.36 ...
## $ residual.sugar : num 1.9 2.6 2.3 1.9 1.9 1.8 1.6 1.2 2 6.1 ...
## $ chlorides : num 0.076 0.098 0.092 0.075 0.076 0.075 0.069 0.065 0.073 0.071 ...
## $ free.sulfur.dioxide : num 11 25 15 17 11 13 15 15 9 17 ...
## $ total.sulfur.dioxide: num 34 67 54 60 34 40 59 21 18 102 ...
## $ density : num 0.998 0.997 0.997 0.998 0.998 ...
## $ pH : num 3.51 3.2 3.26 3.16 3.51 3.51 3.3 3.39 3.36 3.35 ...
## $ sulphates : num 0.56 0.68 0.65 0.58 0.56 0.56 0.46 0.47 0.57 0.8 ...
## $ alcohol : num 9.4 9.8 9.8 9.8 9.4 9.4 9.4 10 9.5 10.5 ...
## $ quality : int 5 5 5 6 5 5 5 7 7 5 ...
whiteWine <- read.csv("whiteWine.csv", sep=";")
str(whiteWine)
## 'data.frame': 4898 obs. of 12 variables:
## $ fixed.acidity : num 7 6.3 8.1 7.2 7.2 8.1 6.2 7 6.3 8.1 ...
## $ volatile.acidity : num 0.27 0.3 0.28 0.23 0.23 0.28 0.32 0.27 0.3 0.22 ...
## $ citric.acid : num 0.36 0.34 0.4 0.32 0.32 0.4 0.16 0.36 0.34 0.43 ...
## $ residual.sugar : num 20.7 1.6 6.9 8.5 8.5 6.9 7 20.7 1.6 1.5 ...
## $ chlorides : num 0.045 0.049 0.05 0.058 0.058 0.05 0.045 0.045 0.049 0.044 ...
## $ free.sulfur.dioxide : num 45 14 30 47 47 30 30 45 14 28 ...
## $ total.sulfur.dioxide: num 170 132 97 186 186 97 136 170 132 129 ...
## $ density : num 1.001 0.994 0.995 0.996 0.996 ...
## $ pH : num 3 3.3 3.26 3.19 3.19 3.26 3.18 3 3.3 3.22 ...
## $ sulphates : num 0.45 0.49 0.44 0.4 0.4 0.44 0.47 0.45 0.49 0.45 ...
## $ alcohol : num 8.8 9.5 10.1 9.9 9.9 10.1 9.6 8.8 9.5 11 ...
## $ quality : int 6 6 6 6 6 6 6 6 6 6 ...
nRed <- 24
nWhite <- 76
wine <- rbind(redWine[sample(1:nrow(redWine), nRed, replace=FALSE), ],
whiteWine[sample(1:nrow(whiteWine), nWhite, replace=FALSE), ]
)
wine$color <- factor(c(rep("red", nRed), rep("white", nWhite)),
levels=c("red", "white")
)
str(wine)
## 'data.frame': 100 obs. of 13 variables:
## $ fixed.acidity : num 7.2 7.9 6.8 4.7 8.5 7.2 7.6 9.2 7.4 6.6 ...
## $ volatile.acidity : num 0.33 0.3 0.36 0.6 0.66 0.38 0.68 0.43 0.6 0.895 ...
## $ citric.acid : num 0.33 0.68 0.32 0.17 0.2 0.3 0.02 0.52 0.26 0.04 ...
## $ residual.sugar : num 1.7 8.3 1.8 2.3 2.1 1.8 1.3 2.3 7.3 2.3 ...
## $ chlorides : num 0.061 0.05 0.067 0.058 0.097 0.073 0.072 0.083 0.07 0.068 ...
## $ free.sulfur.dioxide : num 3 37.5 4 17 23 31 9 14 36 7 ...
## $ total.sulfur.dioxide: num 13 278 8 106 113 70 20 23 121 13 ...
## $ density : num 0.996 0.993 0.993 0.993 0.997 ...
## $ pH : num 3.23 3.01 3.36 3.85 3.13 3.42 3.17 3.35 3.37 3.53 ...
## $ sulphates : num 1.1 0.51 0.55 0.6 0.48 0.59 1.08 0.61 0.49 0.58 ...
## $ alcohol : num 10 12.3 12.8 12.9 9.2 9.5 9.2 11.3 9.4 10.8 ...
## $ quality : int 8 7 7 6 5 6 4 6 5 6 ...
## $ color : Factor w/ 2 levels "red","white": 1 1 1 1 1 1 1 1 1 1 ...
# Fit random forest: model
model <- caret::train(
quality ~ .,
tuneLength = 1,
data = wine, method = "ranger",
trControl = caret::trainControl(method = "cv", number = 5, verboseIter = TRUE)
)
## Loading required package: e1071
## Loading required package: ranger
## + Fold1: mtry=3
## - Fold1: mtry=3
## + Fold2: mtry=3
## - Fold2: mtry=3
## + Fold3: mtry=3
## - Fold3: mtry=3
## + Fold4: mtry=3
## - Fold4: mtry=3
## + Fold5: mtry=3
## - Fold5: mtry=3
## Aggregating results
## Fitting final model on full training set
# Print model to console
model
## Random Forest
##
## 100 samples
## 12 predictor
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 79, 80, 81, 80, 80
## Resampling results:
##
## RMSE Rsquared
## 0.9085138 0.2127601
##
## Tuning parameter 'mtry' was held constant at a value of 3
# Fit random forest: model
model <- caret::train(
quality ~ .,
tuneLength = 3,
data = wine, method = "ranger",
trControl = caret::trainControl(method = "cv", number = 5, verboseIter = TRUE)
)
## + Fold1: mtry= 2
## - Fold1: mtry= 2
## + Fold1: mtry= 7
## - Fold1: mtry= 7
## + Fold1: mtry=12
## - Fold1: mtry=12
## + Fold2: mtry= 2
## - Fold2: mtry= 2
## + Fold2: mtry= 7
## - Fold2: mtry= 7
## + Fold2: mtry=12
## - Fold2: mtry=12
## + Fold3: mtry= 2
## - Fold3: mtry= 2
## + Fold3: mtry= 7
## - Fold3: mtry= 7
## + Fold3: mtry=12
## - Fold3: mtry=12
## + Fold4: mtry= 2
## - Fold4: mtry= 2
## + Fold4: mtry= 7
## - Fold4: mtry= 7
## + Fold4: mtry=12
## - Fold4: mtry=12
## + Fold5: mtry= 2
## - Fold5: mtry= 2
## + Fold5: mtry= 7
## - Fold5: mtry= 7
## + Fold5: mtry=12
## - Fold5: mtry=12
## Aggregating results
## Selecting tuning parameters
## Fitting mtry = 2 on full training set
# Print model to console
model
## Random Forest
##
## 100 samples
## 12 predictor
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 80, 80, 80, 80, 80
## Resampling results across tuning parameters:
##
## mtry RMSE Rsquared
## 2 0.8937233 0.2450648
## 7 0.8979628 0.2446744
## 12 0.9186630 0.2179967
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 2.
# Plot model
plot(model)
# Fit random forest: model
model <- caret::train(
quality ~ .,
tuneGrid = data.frame(mtry=c(2, 3, 7)),
data = wine, method = "ranger",
trControl = caret::trainControl(method = "cv", number = 5, verboseIter = TRUE)
)
## + Fold1: mtry=2
## - Fold1: mtry=2
## + Fold1: mtry=3
## - Fold1: mtry=3
## + Fold1: mtry=7
## - Fold1: mtry=7
## + Fold2: mtry=2
## - Fold2: mtry=2
## + Fold2: mtry=3
## - Fold2: mtry=3
## + Fold2: mtry=7
## - Fold2: mtry=7
## + Fold3: mtry=2
## - Fold3: mtry=2
## + Fold3: mtry=3
## - Fold3: mtry=3
## + Fold3: mtry=7
## - Fold3: mtry=7
## + Fold4: mtry=2
## - Fold4: mtry=2
## + Fold4: mtry=3
## - Fold4: mtry=3
## + Fold4: mtry=7
## - Fold4: mtry=7
## + Fold5: mtry=2
## - Fold5: mtry=2
## + Fold5: mtry=3
## - Fold5: mtry=3
## + Fold5: mtry=7
## - Fold5: mtry=7
## Aggregating results
## Selecting tuning parameters
## Fitting mtry = 2 on full training set
# Print model to console
model
## Random Forest
##
## 100 samples
## 12 predictor
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 80, 80, 80, 81, 79
## Resampling results across tuning parameters:
##
## mtry RMSE Rsquared
## 2 0.8460142 0.3049359
## 3 0.8507520 0.2927151
## 7 0.8554196 0.2847809
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 2.
# Plot model
plot(model)
# Create custom trainControl: myControl
myControl <- caret::trainControl(
method = "cv", number = 10,
summaryFunction = twoClassSummary,
classProbs = TRUE, # IMPORTANT!
verboseIter = TRUE
)
## DO NOT HAVE (AND CANNOT FIND) DATASET "overfit"
# Fit glmnet model: model
# model <- caret::train(
# y ~ ., data=overfit,
# method = "glmnet",
# trControl = myControl
# )
# Print model to console
# model
# Print maximum ROC statistic
# max(model$results$ROC)
# Train glmnet with custom trainControl and tuning: model
# model <- caret::train(
# y ~ ., data=overfit,
# tuneGrid = expand.grid(alpha=0:1, lambda=seq(0.0001, 1, length=100)),
# method = "glmnet",
# trControl = myControl
# )
# Print model to console
# model
# Print maximum ROC statistic
# max(model$results$ROC)
Chapter 4 - Pre-processing data
Median imputation - real-world data has missing values which pose problems for many machine learning algorithms:
KNN imputation addresses the concern that median imputation may miss patterns in the NA data:
Multiple pre-processing methods - can do much more than median imputation or kNN imputation:
Handling low-information predictors - some variables may contain very little information (e.g., variables with no/low variance):
Principal Components Analysis (PCA) is especially valuable for linear modelling:
Example code includes:
v1 <- c( 5, NA, NA, 6, 4, 8, 1, 2, NA, NA, 1, 2, 5, 1, NA, 7, 4, 4, 10, NA, 7, 10, 3, 8, NA, 5, 3, 5, 2, 1, 3, 2, 10, 2, 3, 2, NA, 6, 5, NA, 6, 10, 6, 5, 10, NA, 3, 1, 4, NA, 9, 5, 10, 5, 10, 10, 8, 8, 5, 9, NA, 1, NA, 6, 1, 10, 4, 5, 8, 1, 5, 6, 1, 9, 10, 1, 1, 5, NA, 2, 2, 4, 5, 3, 3, 5, NA, 3, 4, 2, 1, 3, 4, 1, 2, 1, NA, 5, NA, 7, 10, 2, 4, NA, 10, 7, NA, 1, 1, 6, 1, 8, NA, 10, 3, 1, NA, 4, 1, 3, 1, 4, 10, 5, 5, 1, 7, 3, 8, NA, 5, 2, 5, NA, 3, 5, 4, 3, 4, 1, 3, 2, 9, 1, NA, 1, 3, 1, 3, 8, 1, 7, NA, NA, 1, 5, 1, 2, 1, 9, 10, NA, 3, 1, 5, 4, 5, 10, 3, 1, 3, 1, 1, 6, 8, 5, 2, 5, NA, 5, 1, 1, 6, 5, NA, 2, 1, 10, 5, 1, NA, 7, 5, 1, NA, 4, 8, 5, NA, 3, NA, 10, 1, 5, 1, 5, 10, 1, 1, 5, 8, 8, 1, 10, 10, 8, 1, 1, 6, 6, 1, 10, NA, 7, 10, 1, 10, 8, 1, 10, 7, 6, 8, NA, 3, 3, 10, 9, 8, 10, NA, 3, NA, 1, NA, 5, 8, 8, 4, 3, 1, 10, 6, 6, 9, 5, NA, 3, NA, 5, 10, 5, 8, NA, 7, 5, 10, NA, 10, 1, 8, 5, 3, 7, 3, 3, NA, NA, 1, 10, 3, 2, NA, 10, 7, 8, 10, 3, 6, 5, 1, 1, 8, NA, 1, 5, NA, 5, 8, NA, 8, 1, 10, 1, 8, NA, 1, 1, 7, 3, 2, 1, NA, 1, 1, 4, NA, 6, 1, 4, NA, 3, 3, NA, 1, 3, 10, NA, 8, 10, 10, NA, 5, 5, 8, 1, 6, 1, 1, 8, 10, 1, 2, 1, 7, 1, 5, 1, 3, 4, 5, 2, NA, 2, 1, 4, 5, 8, 8, 10, 6, NA, 3, 4, 2, 2, 6, 5, 1, 1, NA, 1, 4, 5, NA, 1, 1, NA, 3, NA, 1, 10, 3, 2, 2, 3, 7, NA, 2, 5, 1, 10, 3, 1, 1, 3, 3, NA, 3, NA, 3, 3, 5, 3, 1, 1, 4, 1, 2, NA, 1, 1, 10, 5, 8, 3, 8, 1, 5, 2, 3, 10, 4, 5, NA, 9, 5, NA, 1, 2, 1, 5, 5, 3, 6, 10, 10, NA, 4, NA, NA, 5, 1, 1, 5, NA, 1, 5, 1, 5, 4, 5, 3, 4, 2, 10, 10, 8, 5, 5, NA, 3, 6, 4, NA, 10, 10, 6, 4, 1, 3, 6, 6, 4, 5, 3, 4, 4, 5, 4, 5, 5, 9, 8, 5, NA, 3, 10, 3, 6 )
breast_cancer_x <- data.frame( X1 = c( v1, 1, NA, 4, NA, 5, NA, 1, 4, 4, 4, 6, 4, 4, 4, 1, 3, 8, 1, NA, 2, 1, 5, 5, 3, 6, 4, NA, NA, NA, 4, 1, 4, 10, 7, NA, 3, 4, NA, 6, 4, 7, NA, NA, 3, 2, 1, 5, NA, NA, 6, NA, 3, 5, 4, 2, 5, 6, NA, 3, 7, 3, 1, 3, 4, 3, 4, NA, 5, NA, 5, 5, 5, 1, 3, NA, 5, 3, 4, 8, 10, 8, 7, 3, 1, 10, 5, 5, NA, 1, 1, 5, 5, 6, NA, 5, 1, 8, 5, 9, 5, 4, 2, 10, 5, 4, 5, 4, 5, 3, 5, 3, 1, 4, NA, 5, 10, 4, 1, 5, 5, 10, NA, 8, NA, 2, 4, 3, NA, 4, 5, NA, 6, 7, 1, 5, 3, 4, 2, 2, 4, 6, 5, 1, NA, 3, NA, 10, 4, 4, 5, 4, NA, NA, 1, NA, 3, 1, 1, 5, 3, 3, 1, 5, 4, NA, 3, 5, 5, 7, 1, 1, 4, 1, 1, NA, NA, 5, NA, NA, 5, 3, 3, 2, NA, NA, 4, 1, 5, 1, 2, 10, 5, 5, 1, NA, 1, 1, 3, NA, 1, 1, 5, 3, 3, 3, 2, 5, 4, NA ))
v1 <- c( NA, 4, NA, 8, 1, 10, 1, 1, 1, 2, 1, 1, NA, 1, 7, 4, 1, 1, NA, 1, 3, 5, 1, 4, 1, 2, 2, 1, NA, 1, 1, 1, 7, NA, 1, NA, 10, 2, NA, 5, NA, 4, 10, 6, NA, 1, 7, 1, NA, NA, NA, 3, 3, 5, 5, 6, 10, 2, 2, 5, 3, 1, 10, 3, 1, 4, 1, 3, NA, 1, NA, 10, NA, 4, 6, 1, 1, 3, 1, NA, 2, 1, 2, 1, 5, 10, NA, 6, NA, 1, NA, NA, 1, 1, 1, 1, 1, 1, 6, 5, 3, 3, NA, 2, 10, 3, 10, 6, 1, 5, 3, 6, 3, 10, NA, NA, 3, NA, 1, 2, NA, NA, 10, 3, 4, 1, 5, 1, 3, 1, 1, 1, 10, NA, 1, NA, 1, 1, 1, 1, 1, 1, 5, NA, 1, 1, 4, NA, NA, 8, 1, 2, 10, NA, 1, 5, 2, 1, NA, 9, 7, 1, 1, 1, 1, 1, 6, 8, 1, 1, NA, NA, NA, 10, NA, 8, 1, 10, 1, 3, NA, 1, 1, 8, 7, 1, 5, 5, 8, 2, NA, 5, NA, 1, 1, 1, 4, 1, 1, NA, 7, 8, 1, 1, 1, NA, 10, 1, 1, 1, 10, 10, 1, 10, NA, 7, NA, 1 )
breast_cancer_x <- cbind(breast_cancer_x, c( v1, 10, 1, 1, 6, 1, 5, 5, 1, 5, 9, 1, 10, 4, 8, NA, 4, NA, 1, 8, 8, 10, 4, 1, 1, 1, 1, 1, 1, 10, 4, 1, 1, 2, NA, 3, 10, 10, 6, 1, NA, 1, 7, NA, 10, NA, 4, NA, 1, 10, 3, 8, 1, 4, 1, NA, 2, 1, 1, 1, 1, NA, 5, 1, NA, 4, 4, 4, 10, 10, 1, NA, 6, 1, 1, 8, 4, 1, 5, 3, NA, 2, 1, 4, 1, 10, 1, NA, 8, NA, 1, 8, 1, 1, 1, NA, 1, 1, 6, 5, 8, NA, 4, 6, 1, 1, 4, 1, 2, NA, 1, NA, 4, 4, 1, 2, 4, 6, 1, 5, 1, 1, 5, 3, 1, NA, NA, 6, 1, NA, 1, 4, 2, 1, 1, 4, 7, 1, 1, NA, 10, 10, 3, 10, 10, 2, NA, 1, 1, 10, 8, NA, 1, 3, NA, 1, 1, 1, 1, 1, 1, 1, 3, 1, 6, NA, 1, 1, 3, 6, 3, 1, 1, 1, 8, 1, NA, 2, 1, 1, 1, 2, 2, NA, 1, 3, 1, NA, 1, 2, 1, 3, 1, 1, 1, 10, 1, 5, 3, 7, 1, 2, 3, 2, 10, 3, 1, 1, NA, NA, 7, 1, 1, 3, NA, 1, NA, 9, 8, 10, 1, NA, NA, NA, 2, 1, 1, NA, 1, NA, 1, 1, 7, 1, 1, 1, NA, 3, NA, 6, 8, 1, 1, 1, 1, 1, 1, 1, 9, 6, NA, 1, 1, 1, 1, NA, 1, 1, 1, 1, 1, 2, NA, NA, NA, 10, 7, 1, 1, 1, 10, NA, NA, 1, 8, NA, 10, NA, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 3, 10, 1, NA, 1, 1, 1, 1, 1, 6, 10, 1, 1, 1, 7, 1, 1, 4, 5, NA, 1, 1, NA, NA, 1, 4, 2, NA, 1, 1, 1, NA, 1, NA, NA, 1, 1, NA, 1, 1, 1, 10, 1, 1, 8, 1, 1, 2, 4, 1, NA, 2, 1, 1, 1, 1, 1, 1, 1, 1, 7, NA, 1, 4, 10, NA, 6, 1, 1, 9, 1, 1, 1, 1, NA, 1, 7, 10, 1, NA, 1, 10, 1, 8, 1, 10, 5, NA, NA, 8, NA, 1, 1, 1, NA, 1, 1, 1, 4, 3, 5, 1, 1, 10, 1, 4, 10, 10, 3, 1, 1, 1, NA, 1, 1, NA, 3, 1, 1, 1, 1, 6, 1, 1, NA, NA, 1, 1, 7, 1, 1, 10, 2, 1, 1, 1, NA, 1, 1, 1, NA, NA, 1, 10, 1, 1, 2, 1, 1, NA, 1, 1, 4, NA, 1, 1, 1, NA, 1, 1, 1, 2, 1, 7, 10, NA, 2, 1, 3, 1, 1, 1, 1, 1, 1, 10, 10, 1, 1, 1, 1, 1, 1, 1, 1, 1, 10, 1, 1, 1, 1, 10, 8, 8 ))
v1 <- c( 1, 4, 1, 8, 1, 10, NA, 2, 1, 1, 1, 1, 3, NA, 5, 6, 1, 1, 7, 1, 2, 5, 1, 5, 1, 3, 1, NA, 1, 3, NA, 1, 7, 1, 2, 1, 10, 1, 4, 3, NA, NA, NA, 5, NA, 1, 7, 1, 1, 7, 8, 3, NA, 5, 5, 6, 10, 4, 3, 5, 5, 1, 10, 4, NA, 2, 1, 4, 8, 1, 3, 2, 3, 5, 4, 2, 4, 1, 1, 1, 2, 1, 1, 1, 7, 6, 6, 6, 1, 1, NA, NA, 1, 1, 1, 1, 1, 1, NA, 6, 5, 4, 2, 3, 10, 4, NA, 8, 1, 4, NA, 4, 3, 10, 2, 1, 3, 5, 1, 1, NA, 1, 10, 5, NA, 1, 3, 1, 5, 1, 3, 1, 8, 1, 1, 1, 1, NA, 2, 1, NA, 1, 5, 1, 1, NA, 5, 1, 1, 7, 1, 4, 8, 1, 1, 5, 2, 1, 2, 10, 7, NA, 1, 1, 1, 1, 7, 10, 1, 1, 1, 1, NA, 10, 5, NA, 1, 10, 1, 3, 1, NA, 1, 8, 6, NA, 8, 6, 4, 3, NA, 10, 1, 1, 1, 1, 4, 1, 1, 1, 7, 8, 1, 1, 1, NA, 9, 1, 1, 1, NA, 8, 1, 10, 10, 8, 1, NA, 7, 3, 1, NA, 1, 6, 5, 1, 7, 9, 1, NA, NA, NA, 6, 5, 2, 4, 8, 8, 10, 3, 3, 1, 1, 1, 1, 1, 10, 4, 1, 1, 2, 4, 3, 10, 10, 6, NA, 1 )
breast_cancer_x <- cbind(breast_cancer_x, c( v1, 1, NA, 8, 10, 9, NA, 4, NA, 6, 5, 8, 1, 7, NA, 5, 4, 1, 3, 1, 1, 1, 7, NA, 1, 3, 6, 5, 10, 10, 1, 3, 6, NA, 1, 8, 4, 1, 7, 4, 3, 1, 2, 10, 1, 10, 1, 4, 4, 1, NA, 7, 1, 1, 1, 4, 1, 1, 5, 5, 7, 1, 4, 3, 1, 1, 6, 1, 2, 1, 1, 3, 6, NA, 1, NA, 6, 7, 1, 5, 1, 1, 5, 3, NA, 1, 1, 4, 1, 2, 1, 4, 3, NA, 1, 5, 10, 1, 1, 3, NA, 5, 5, NA, NA, 2, 4, NA, 1, NA, 8, 3, 3, NA, 3, 2, 1, 2, NA, NA, 1, 1, 4, 1, 3, 2, 1, 1, 2, 6, NA, 1, 1, 1, 7, 1, NA, 3, 1, 1, 1, 1, 3, 8, NA, 3, 1, 1, NA, NA, 1, 2, 2, 1, 1, 10, 2, 6, 2, 8, NA, 2, 1, 2, 10, 3, 3, 1, 10, 6, 8, 1, 1, 1, 1, 1, 2, 7, 10, NA, 1, NA, 1, 3, 2, 1, 1, NA, 1, 1, NA, 1, 9, NA, 1, 1, 5, 1, 2, 5, 9, 2, NA, 1, 1, 1, NA, NA, 8, 6, 6, 1, NA, 1, 1, 1, 1, 1, 1, 2, NA, 1, 7, 1, 2, 10, 8, 2, NA, 1, 10, 4, NA, 1, 9, 1, NA, NA, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, NA, 10, 1, NA, 1, 1, 1, 1, 1, 7, 4, NA, NA, NA, 8, 1, 1, NA, 6, 1, 1, 1, 1, 3, 1, 4, 2, 1, 1, NA, 3, 1, 2, NA, 1, 1, 1, 1, NA, 3, 1, 10, 1, 1, 3, 1, 1, 2, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, NA, 1, 10, 2, 1, NA, 8, 4, 10, 1, NA, 7, 2, NA, 1, 1, 1, 2, 10, 5, 1, 1, 1, 10, 1, NA, 1, 8, NA, 4, 2, 6, NA, 2, 3, 1, NA, NA, 1, 1, 6, NA, 10, 1, 1, 10, 1, 3, 10, 10, 1, 1, 3, 1, 1, 1, NA, 1, NA, 2, 1, 1, 3, NA, NA, 1, 1, 3, 1, 1, 4, 1, 4, 7, 4, 1, NA, 1, 1, 1, 1, 1, 1, 2, NA, 10, 1, 1, NA, 1, 1, 1, 1, 1, 5, 8, 1, NA, 1, 3, 3, 1, NA, 2, NA, 4, 10, 7, 1, 1, 2, NA, 4, 2, 1, 1, NA, 10, 10, NA, 1, 1, 1, 1, 1, 1, 1, 1, 10, 1, NA, 1, 1, 10, 6, 8 ))
v1 <- c( 1, NA, 1, NA, 3, 8, NA, 1, NA, 1, 1, 1, 3, 1, 10, 4, 1, 1, 6, 1, 10, NA, 1, NA, 1, 4, 1, 1, 1, 1, 1, NA, 3, 2, 1, 1, 8, 1, 9, 3, NA, 1, 2, NA, 4, 1, 4, 1, 3, NA, 1, NA, 2, 8, 6, NA, 1, 1, 1, NA, 5, 1, 1, NA, 1, 1, NA, 1, 3, NA, 1, 8, 2, 10, 1, 1, NA, 2, 1, 1, 1, 2, 1, 1, 8, 1, 4, 6, 1, 2, 1, 2, NA, 1, 1, 1, NA, 1, NA, 10, 1, NA, 1, 1, NA, 4, 8, 10, NA, 4, 2, 3, 10, 3, NA, 1, 1, 10, 1, NA, 2, 1, NA, 1, 7, 1, 7, 1, 4, 1, 1, 1, NA, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 1, 1, 1, 2, 1, 3, NA, 1, 1, 6, 1, 1, 6, 1, 1, 1, NA, 4, 1, 1, 2, 1, 1, 8, 10, 1, 2, 1, 1, NA, 10, 4, 7, 1, 3, NA, 3, 1, 1, NA, 8, 4, 1, NA, 10, 10, 1, 8, 10, 1, 1, NA, 1, NA, 4, 1, 1, 5, 4, 1, 1, 1, 9, 3, 1, 1, 1, 10, 8, 1, NA, 10, 7, 1, 1, 7, 1, NA, 3 )
breast_cancer_x <- cbind(breast_cancer_x, c( v1, 3, 3, 6, 1, 4, 5, 1, 3, 4, 5, 3, 5, 1, 1, NA, 5, 8, 2, 3, 3, 1, 1, 1, 2, 8, 1, 1, 1, NA, 10, 5, NA, 1, 2, 1, 1, 1, 1, 10, 6, 4, 10, NA, 1, 3, 2, 2, 1, 1, 1, 2, 1, 1, NA, NA, 1, NA, 3, NA, NA, 10, 1, NA, 10, 10, 1, 1, 8, 1, 1, 1, 6, 1, 8, 3, 1, 1, 6, 5, 1, 7, 1, NA, 4, 1, 1, 6, NA, 1, 1, 10, 1, NA, 6, NA, 8, NA, 4, 2, 1, 1, 10, NA, 1, 1, NA, 2, 4, 2, NA, NA, 6, 3, 1, 8, NA, NA, 5, 1, 1, 1, 1, 8, NA, 2, NA, 10, NA, 3, 1, 3, 10, 1, 1, 1, 7, 3, 4, 10, 10, 1, 2, 1, 1, 10, 10, NA, 1, 1, 1, 1, 2, 1, 1, NA, 1, 4, 1, 1, 6, 2, 1, 1, 2, 3, 2, 1, NA, 2, 4, 1, 1, 1, 1, 1, 1, 1, 1, 7, 1, 1, 1, NA, 1, 1, NA, 2, 1, 1, 1, 6, NA, 2, 6, 5, 1, 2, 1, NA, 7, 1, NA, 1, 10, 1, 2, 1, 1, 1, 3, 1, 3, 5, 1, NA, 1, 3, 1, 10, 4, 3, 1, 6, 1, 1, 1, 1, 8, 3, 1, NA, NA, 1, 1, 8, 6, 1, 1, 3, 1, 3, 1, 1, 7, 2, 5, 1, 1, 1, 3, NA, NA, 1, 1, 1, 1, 1, 10, 1, 4, NA, 5, 1, 3, 1, 10, 10, 1, 1, 4, 1, 10, 10, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 10, NA, 1, 1, 1, 1, 1, 1, NA, 7, NA, NA, 2, 3, 1, 1, 4, 10, NA, NA, NA, 1, 2, 1, 3, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 10, 1, NA, 7, 1, 1, NA, 1, 1, 1, 2, 3, 1, NA, 1, 1, 1, 1, 1, NA, 1, 1, 1, 10, 4, 5, 1, 1, 3, 1, 1, 1, NA, 1, 1, 6, 5, 1, 6, 1, 10, 1, 9, NA, 5, 6, 5, 1, NA, 1, 1, 1, NA, NA, NA, 1, 1, 8, 8, 3, 2, 1, 10, 1, 10, 10, NA, 1, 1, 1, 1, 1, 1, 1, NA, 3, 3, 1, 2, NA, NA, 1, 1, 1, 1, NA, 1, 4, 1, 1, 8, NA, NA, 3, 3, 1, 1, 1, 1, NA, 1, 3, 10, 1, 2, 3, NA, 1, 1, 1, 1, 1, 7, 1, NA, 1, 1, NA, NA, 1, 2, 1, 1, 8, NA, 2, NA, 1, 1, 1, 1, 1, 1, 1, 10, NA, 1, 1, 1, 1, 1, 1, 1, 1, 3, 5, 1, 1, NA, 1, 3, 4, 5 ))
v1 <- c( NA, 7, 2, NA, 2, 7, 2, 2, 2, 2, 1, 2, NA, 2, 7, 6, 2, 2, 4, 2, 5, 6, 2, 2, 2, NA, 1, NA, 2, 2, NA, 2, 8, 2, 2, 2, 6, NA, 2, NA, NA, 3, 8, 10, 8, 2, 4, 2, 2, 4, 2, 2, 3, NA, 8, 4, 3, NA, 6, NA, 3, 2, 10, 5, 2, NA, 2, 8, NA, 2, 2, 10, 2, 6, 3, 2, 2, 2, 2, NA, 1, 2, 2, 2, NA, 10, 5, 5, 2, 3, 2, 2, 2, NA, 2, 2, 2, NA, 10, 5, 10, 2, NA, 6, 10, NA, 2, NA, 2, 3, 2, 5, 2, 10, 2, 2, 2, 4, NA, NA, 2, 2, 10, 8, 9, NA, 4, 2, 5, 10, 2, 2, 8, 2, 3, NA, 2, 2, NA, 1, NA, 2, NA, 2, 2, 2, 6, 3, 8, 10, 1, 6, NA, 2, 2, NA, 2, 2, 3, 6, 5, 2, 2, 1, 2, NA, 8, NA, 2, 1, 2, NA, 2, 8, NA, 10, 2, 8, NA, NA, 1, 2, 2, NA, NA, 1, 5, 6, 5, 2, 6, 10, 2, 2, 2, 2, NA, 2, 2, NA, 5, 10, 2, 2, 2, 6, 7, 1, 1, 1, 5, NA, 2, 7, 3, 5, 2, NA, 6, 2 )
breast_cancer_x <- cbind(breast_cancer_x, c( v1, NA, NA, 1, 3, 3, 2, NA, 3, 1, 10, 3, 6, 3, 5, 3, 2, 8, 6, 6, 3, 2, 1, 2, 2, 2, 2, 5, 2, 2, 2, 2, NA, 3, 8, 10, 4, 2, 2, 2, 5, NA, 10, 5, NA, 10, 2, NA, 3, 3, NA, 3, 2, NA, 3, 2, 2, NA, 2, 2, 3, 2, 2, 4, 2, 2, 8, 10, 3, 4, 6, 2, 2, 2, NA, 2, 6, 4, 2, 5, 4, NA, 2, 9, 2, 3, 4, 2, 2, 4, 2, 3, 2, 10, 2, 1, NA, 5, 6, 5, 6, 5, 2, 2, NA, 2, 2, 2, 2, 6, 5, 2, 2, 2, 4, 3, 2, 4, 2, 1, 2, 2, 2, 2, 2, 10, 2, 3, 1, 5, 3, 2, 2, 7, NA, 2, 3, 3, NA, 8, 3, 10, 6, 4, 2, 2, 2, 8, 5, 2, 1, 3, 2, 2, 2, 2, 2, 2, 1, 3, 4, 2, 4, NA, 2, 2, 3, 2, 3, 2, NA, 2, 3, 2, NA, NA, 2, NA, 2, 2, NA, NA, 2, 2, 2, 2, 2, 2, 2, NA, NA, 2, 1, 8, 2, 3, 3, 10, 2, 2, 5, 2, 10, 2, 2, 2, 10, NA, 4, NA, 2, 2, 4, 2, 2, 5, 3, NA, 2, 2, 2, 4, 2, 2, 2, 3, NA, 2, 2, 1, 6, 1, 2, NA, 6, 3, 2, 5, 6, 2, 2, 2, NA, NA, 2, 2, 6, 4, NA, 2, 2, 1, 2, 1, 2, 2, NA, 2, 2, 2, 4, 1, 2, 10, NA, 2, 1, 1, 6, 3, 3, 2, 3, 1, 6, 4, 1, 1, 2, 2, NA, 2, 2, 2, 2, 2, 2, 7, 2, 2, NA, 2, 2, 2, NA, 3, 3, NA, 1, 2, 4, 3, 3, NA, 4, 2, 2, NA, NA, NA, 1, NA, 2, NA, 2, 2, 2, 2, 2, 2, 2, 2, 2, NA, 2, 2, 2, 4, 1, 1, 4, 2, NA, 2, 2, 2, 2, NA, 2, 2, 2, NA, 2, 2, 2, 2, 5, 2, 2, 6, 6, 8, 3, 2, 2, NA, 2, 2, 2, NA, 2, 2, NA, 4, NA, 3, 2, 6, 2, 6, 2, 4, 4, 3, 2, 4, 2, 2, 2, 2, 1, NA, 1, 2, 4, 5, 5, 2, 2, 10, 2, 3, 5, NA, 2, 1, 2, 2, NA, 2, 2, 2, 3, 2, NA, 1, 3, 7, NA, 2, 2, 2, 2, 2, 5, 2, 2, 7, 2, 2, NA, 2, 2, 2, 2, 2, 2, 2, 2, 10, 2, NA, NA, 2, 2, 2, 2, NA, 8, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 5, 5, 2, NA, 3, 2, 2, NA, 2, 2, 2, 5, 4, 2, NA, 2, NA, 2, 2, 2, 2, 2, 4, NA, 2, NA, 2, 7, 3, 4 ))
v1 <- c( 1, 10, NA, 4, 1, 10, 10, 1, 1, 1, 1, 1, 3, 3, 9, 1, 1, 1, 10, NA, 10, 7, 1, NA, 1, 7, 1, 1, 1, 1, 1, 1, NA, NA, 1, 1, 1, 1, 10, NA, NA, 3, NA, 1, 1, 1, 9, NA, 1, 8, 3, 4, 5, 8, 8, 5, 6, 1, 10, 2, 3, 2, 8, 2, 1, 2, 1, 10, 9, 1, 1, 2, 1, 10, NA, 2, 1, 1, 3, 1, 1, 1, 1, 2, 9, 4, 8, 10, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 6, 10, 5, 5, 1, 3, 1, 3, 10, 10, 1, 9, 2, 9, 10, 8, 3, 5, 2, NA, 3, NA, 1, 2, 10, 10, 7, 1, NA, NA, 10, NA, 1, 1, 10, 1, 1, 2, 1, 1, 1, NA, 1, 1, NA, 5, NA, NA, 8, 2, 1, 10, 1, 10, 5, 3, NA, 10, 1, 1, NA, 10, 10, 1, 1, 3, NA, 2, 10, 1, 1, 1, 1, 1, 1, 10, 10, 10, 1, 1, 1, NA, 1, 1, 1, 10, 10, 1, 8, NA, 8, NA, 8, 10, 1, NA, 1, 1, 7, 1, 1, 1, 10, 10, 1, 1, 1, 10, 5, 1, 1, 1, 10, 8, 1, 10, 10, 5, 1, 1 )
breast_cancer_x <- cbind(breast_cancer_x, c( v1, 4, NA, 1, 10, NA, 8, 10, 1, 10, 5, 1, NA, 7, 8, 1, NA, 1, NA, 10, 2, NA, 10, 2, NA, 1, 5, 1, NA, 10, 9, 1, NA, NA, 10, 10, 10, 8, 10, 1, 1, NA, 8, 10, 10, 10, 10, 3, 1, 10, 10, NA, NA, 10, 1, 10, 4, 1, NA, 1, 1, 1, 7, 1, 1, 10, NA, 10, 10, 10, 1, 5, 10, 1, 1, NA, NA, NA, 10, 5, NA, 1, NA, 4, 1, 10, 1, 10, 10, 1, 1, NA, NA, 1, 1, 1, 1, 1, NA, 10, 8, 1, 5, NA, NA, 1, 10, 1, 1, 10, 1, 4, NA, 8, 1, 1, 10, 10, 1, NA, 1, NA, 10, 10, NA, NA, 1, NA, 1, 1, 1, 1, 8, 1, 1, 3, 10, NA, 1, 3, 10, 4, 7, 10, 10, 3, 3, 1, 1, 10, NA, 1, 1, 1, 1, 1, 1, NA, 1, 1, 1, 1, NA, 1, 10, NA, 1, 1, 1, 10, 1, 1, 2, 1, 10, 1, 1, NA, NA, NA, 1, 1, 1, 9, 1, 1, 4, 1, 1, 1, NA, 2, 1, NA, NA, 4, NA, 10, 3, 10, 1, 2, 1, 3, 10, 1, NA, 1, 10, 1, 2, NA, 1, 1, 1, 1, 1, 8, 10, NA, 1, 1, 1, 10, 4, NA, 2, 1, 1, 1, 1, 1, 10, NA, 1, 1, 10, 1, 6, NA, NA, 1, 1, 1, NA, 1, 1, 1, 4, 10, 10, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 10, 1, 1, 5, NA, 1, NA, 1, 10, 3, 4, 1, 10, 1, 10, 5, 1, 1, 1, 1, 1, 1, NA, 1, 1, NA, 1, 5, 4, 1, 1, 1, 1, NA, NA, 10, 10, 1, 1, 1, 10, 1, 1, 5, 10, 1, 1, 1, NA, 1, 1, 10, 1, 1, 1, 1, NA, 1, 1, 1, NA, 2, 1, NA, 1, NA, 1, 10, 1, 1, NA, 1, 1, 1, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 10, 1, 3, 10, 5, 10, 10, 1, NA, NA, 1, 1, 1, 1, NA, NA, NA, 10, 1, NA, 1, 10, 1, 3, NA, 1, NA, 10, 1, 10, 1, 1, 1, 1, 1, 1, 1, 1, 1, NA, 8, 1, 1, 10, 1, 10, 2, 10, 1, 1, 1, 1, NA, 1, 1, NA, 2, 1, 1, 1, 4, 6, 5, NA, 1, 1, 1, NA, 3, 1, 1, 1, 2, 1, 1, NA, 1, 1, NA, 1, 1, 1, 1, NA, 1, 4, 1, 1, 1, 1, 1, 1, 1, 10, 1, 1, 1, NA, 1, NA, NA, 1, 1, 1, NA, 8, 1, 1, 1, 1, NA, 1, 1, 1, 1, 10, 10, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5, 1, 1, 2, 1, 3, 4, 5 ))
v1 <- c( 3, 3, 3, 3, 3, NA, 3, 3, NA, 2, 3, 2, 4, 3, 5, NA, 2, 3, 4, 3, 5, 7, NA, 7, 3, 3, 2, NA, NA, 1, 2, 3, 7, 3, 2, 2, 8, 7, 5, 7, 7, 6, 7, 3, 8, NA, 4, 2, 3, NA, 2, 3, 4, 7, 7, 3, 3, 5, 5, NA, 4, NA, 3, 3, 2, 4, 3, 4, 8, 3, 2, 7, 7, 4, 3, 4, 2, 2, 3, 2, 7, 2, 3, 7, 7, 4, NA, 6, 3, 2, 3, 1, 3, NA, 3, NA, 1, NA, 2, 7, 3, 2, NA, 7, 8, 3, 4, 5, 2, 7, 5, 3, 7, NA, 3, 1, NA, NA, 1, NA, NA, 3, 5, 5, 8, 2, 7, NA, 1, 1, 2, 3, 3, 2, 2, 3, 2, 1, 2, 2, 1, 1, 4, 1, 2, 2, 4, 2, 5, 7, 3, NA, 8, NA, 1, NA, 2, 3, 1, NA, 5, 3, 3, 1, 3, 3, 3, 3, NA, 1, 1, 3, 2, 10, 6, 5, NA, 5, 3, 3, 3, 1, 3, 7, 5, 3, 7, 7, 9, 3, 7, 4, 2, 3, NA, 3, NA, 3, NA, 2, 7, 8, 3, NA, 3, NA, 3, 3, 3, 3, 8, 7, 3, 7, 10, NA, 2, 3, 8, 3, 3, 9, 2, NA, 7, 2, 8, 7, 3, 9, NA, 8, 4, 4, 3, NA, NA, 4, 3, 5, 2, 3, 3, 5, NA, 3, 7, NA, NA, 3, 1, 5, 3, 7, 3, 3, 1, 2, 3, 3, 5, NA, 7, 5, 5, 3, 4, 7, 8, 3, NA, 3, NA, 3, NA, 2, 2, NA, 3, 3, 3, NA, 5, 5, 3, NA, 4, 2, 5, 4, 1, 3, 6, 2, NA )
breast_cancer_x <- cbind(breast_cancer_x, c( v1, 7, 4, 2, 1, 7, 7, 3, 7, 3, 3, 3, 3, 3, 8, 5, 2, 1, 3, 1, NA, 4, 4, 8, 3, 7, 7, 3, NA, 4, 3, 2, 5, 2, 3, 7, 6, 3, NA, 4, NA, 1, 3, 3, 2, NA, NA, 3, 1, NA, NA, 1, 1, 1, 3, 7, 1, 3, 4, NA, NA, 2, 3, 7, 4, 3, 8, 5, NA, 2, 3, 2, NA, 8, 1, NA, 2, 1, 2, NA, 2, NA, 2, 2, 2, 3, 1, 7, NA, 1, 1, 1, 7, 3, 2, 2, 2, 7, 2, 1, 2, 2, NA, 1, 2, 1, 9, 1, 2, 1, 1, 2, 2, 2, 3, 2, 2, NA, 8, NA, 6, 3, 7, 2, 3, 1, 3, 8, 3, 2, NA, 10, NA, 5, 2, 2, 2, 3, 2, 1, 4, NA, 2, 1, 1, 1, 10, NA, 1, NA, 2, 1, 1, 1, 1, NA, 2, 1, 1, 10, 1, 1, 8, 10, 1, 1, 1, 1, 1, 1, 1, 7, 9, 7, 1, 2, 2, 1, NA, 1, 1, NA, 1, 1, 1, 7, 1, 1, 10, 9, NA, 1, 2, 8, 3, 4, 1, 7, 2, 6, 2, 2, 1, 1, 2, 2, NA, NA, 2, 3, 1, 1, NA, 1, 1, 1, NA, NA, 1, 2, 8, 9, 1, 2, 1, 9, 1, NA, 7, NA, 2, 1, NA, 3, 1, 2, 6, 2, 3, NA, 2, 3, 3, NA, 2, 2, 2, 1, 1, 2, 2, 2, 7, 1, 1, 7, 2, 3, 4, 2, 1, 4, NA, 1, NA, 2, 3, 3, 3, 2, 3, 10, NA, NA, 2, 10, 8, 9, NA, 2, 7, 3, NA, 2, 2, 3, 2, 7, 6, 1, 1, 1, 10, NA, 4, NA, 10, 7, 4, 1, 7, 2, 2, NA, NA, 1, 2, 2, 2, 8, NA, 7, 1, 1, 10, 1, 7, 8, 10, 2, 2, 2, NA, 1, 2, 2, 2, NA, 2, NA, 2, 1, 7, NA, 1, 1, 1, 2, 1, 5, NA, 1, 10, 2, 1, 1, NA, 2, 2, 1, 1, 2, 1, NA, 10, 2, 1, 2, 2, 2, NA, 2, NA, 3, 7, 1, 2, 3, NA, 2, 2, 1, 1, 3, 7, 7, 7, 3, 3, 1, 2, 1, 2, NA, 1, 1, 10, 5, 3, 1, 1, 1, 1, 2, 1, 1, NA, 4, 1, 2, 1, 1, 8, 10, 10 ))
v1 <- c( 1, NA, 1, 7, NA, 7, 1, 1, 1, NA, 1, 1, NA, 1, NA, 3, 1, 1, 1, 1, 4, 10, 1, 3, 1, 6, 1, 1, 1, 1, 1, 1, NA, 1, 1, NA, 9, 1, 6, NA, 8, NA, 3, 1, 10, NA, 8, NA, 1, 8, 1, 4, 10, 3, 1, 6, NA, 4, 1, 1, 10, 1, 3, 9, 1, 3, 1, 9, 9, 2, NA, 8, 2, 8, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, NA, 10, 4, 8, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, NA, 9, NA, 5, 1, 1, 8, 2, 1, NA, 3, 8, 3, 1, 3, 1, 1, NA, 2, 5, 1, 1, 1, 1, 3, 3, 10, NA, 5, 1, NA, NA, 1, 1, 6, 2, 1, 3, 1, 1, 1, 1, 1, 1, 3, 1, NA, 1, 1, 1, 8, NA, 1, 4, NA, 1, 1, 1, 1, 1, 1, 10, 7, 2, NA, 1, 1, 2, 10, 1, 1, 1, 1, 1, 1, NA, NA, 7, NA, 10, 1, 1, NA, 1, 1, 8, NA, NA, 10, 7, 10, NA, 10, 10, NA, 1, 1, 1, NA, 1, 1, NA, NA, 1, NA, NA, 1, 10, 5, NA, 1, NA, 10, 7, 1, 10, 6, 10, 1, 1, 10, 1, 1, 10, 1, NA, 9, 1, 9, 7, 1, 10, 6, 9, 3, 1, 6, 1, 8, 10, 10, NA, 3, 1, 1, 1, 1, 1, 8, 3, 6, 1, 1, 3, 5, 3 )
breast_cancer_x <- cbind(breast_cancer_x, c( v1, 3, 6, 1, 1, 1, 4, 1, NA, 8, 5, 3, 2, 3, 1, 7, 1, 9, 1, 1, 3, 2, 1, NA, 1, NA, 3, 1, 1, 6, NA, 8, 7, 10, 1, 10, 10, 1, 1, 10, 3, NA, 4, 7, NA, 1, 7, 10, 1, NA, 1, 3, 10, 1, 1, 8, NA, NA, NA, 5, 1, 1, 9, 3, NA, 1, 3, 4, 1, 1, NA, 1, 3, 4, NA, NA, 1, 1, NA, NA, 3, 4, 1, 4, 1, 1, NA, 6, 1, NA, NA, NA, 1, 3, 3, 3, 6, 1, 1, 6, NA, 1, 2, 3, NA, 10, 5, 10, 1, NA, 1, 1, 1, NA, 10, 1, NA, NA, 1, 1, NA, NA, 1, 1, NA, 2, NA, 1, 8, 2, 1, 1, 2, 1, 1, 2, 2, 1, 9, 1, 1, 1, NA, 1, 1, NA, 1, 3, 1, 1, 1, 2, 1, 1, 1, 1, NA, 1, 1, NA, 1, 6, 5, 2, 1, 2, 1, NA, 2, 3, 1, 1, 10, 1, 10, 1, 1, 2, 2, 2, NA, NA, NA, 8, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, NA, 1, 10, 1, 1, NA, 7, 1, 1, 6, 10, 1, 1, 1, 1, 1, NA, 1, 10, 7, 6, 1, 1, 1, 1, 1, NA, 1, 1, NA, 1, 1, 5, 1, 1, 10, 10, 1, 1, 1, 1, 4, NA, 1, NA, NA, 5, 1, 1, NA, 1, NA, 1, 1, 1, 1, NA, 1, 1, 8, 1, 1, 1, 1, 1, 1, 1, NA, 10, 1, 1, 1, 1, 1, 1, 3, 3, 1, 1, 1, NA, 1, 1, 9, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, NA, 1, 1, 10, 1, 1, 8, 1, NA, 2, 1, NA, 8, 1, 1, 1, 1, 1, 1, NA, 1, 2, 10, 1, 1, 5, 3, NA, 10, 1, 1, 7, 1, 1, 1, 1, 1, 1, 5, 10, 1, 1, 1, 10, NA, 1, 1, 1, 6, 1, 1, 1, NA, 1, 1, 1, 1, 1, 1, 1, 10, 1, 8, 1, 1, NA, 1, 1, 5, 10, 1, NA, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 7, 1, 1, 1, 1, 1, 1, 10, NA, 1, 10, 1, 1, 1, 1, 1, 1, 1, NA, 1, 1, 1, 10, 1, 1, NA, 2, 1, 1, 1, 1, 6, 2, 1, 1, 1, 1, 1, 1, 1, 1, NA, 10, 10, 4, 1, 1, 1, NA, 1, 1, NA, NA, 1, 10, NA, 2, 1, 1, 1, NA, 3, 1, 1, NA, 4, 1, 1, 1, 1, 10, NA, 4 ))
v1 <- c( 1, 1, 1, 1, 1, 1, 1, 1, 5, 1, 1, NA, 1, 1, 4, 1, 1, 1, NA, 1, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, NA, 1, 3, 1, 1, 1, NA, 1, 1, 1, 1, 2, 3, 1, 1, 2, NA, 1, 1, 2, 5, NA, 2, 7, NA, 1, 1, 4, 1, 1, 1, 1, 1, 1, 1, 10, 1, 1, 8, 1, 1, 10, NA, 1, NA, 1, NA, NA, 1, 1, 1, NA, NA, NA, 7, 10, 1, NA, NA, 1, NA, NA, 1, 1, 1, 1, 1, NA, NA, 4, 2, 1, 1, 1, 8, 7, 1, 1, NA, 3, 2, 1, 3, 1, 1, 1, 1, 8, 1, 1, 1, 1, 3, 1, 1, NA, 5, 1, NA, 1, 1, 1, 3, NA, 1, 1, 1, 1, 1, NA, 1, 1, 3, 1, NA, 1, 1, 1, 1, NA, 1, 3, 1, 1, NA, 1, NA, 1, 1, 6, 2, 1, NA, NA, NA, 1, 3, NA, NA, 1, NA, 1, 1, 7, 1, 1, NA, 3, 1, 1, NA, NA, 1, 1, 1, 1, 1, 10, 1, 1, NA, 3, 1, 1, 1, 1, 2, 1, 1, 1, 3, 1, 1, 1, 1, 5, 1, 1, 1, 1, NA, 1, NA, 4, 1, 2, NA, 1, 2, 1, 1, 1, 1, 1 )
breast_cancer_x <- cbind(breast_cancer_x, c( v1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, NA, 1, 10, NA, NA, 2, 1, 1, NA, 1, 1, 1, 1, 1, 1, NA, 1, 3, 3, 3, 1, 1, 1, 1, 1, 1, 3, 5, NA, 1, NA, 1, 2, 1, 8, 1, NA, 1, 1, NA, NA, 1, NA, 1, 1, 8, 1, 1, 1, 1, 2, 3, 10, 1, 1, 4, 1, 1, 1, 1, 1, 1, NA, 1, 1, 2, NA, 1, NA, 1, 1, NA, 1, NA, NA, 1, 1, NA, 1, 1, 1, 1, 1, 1, 1, 1, 6, 1, 1, 1, 1, NA, 1, 1, NA, 1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, NA, 1, NA, 3, 1, 1, 1, 1, 1, 1, 1, 1, 4, 1, 1, NA, 8, 3, 3, 10, NA, NA, 1, 1, 1, 7, 3, 1, 1, 1, 1, 1, 1, NA, 1, NA, 1, 1, 1, 1, 4, 1, 1, 1, 3, NA, 1, 1, NA, 1, 1, 1, 1, 1, 1, NA, NA, 1, 1, NA, NA, 1, NA, NA, NA, 1, 1, 1, 1, 1, 1, NA, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, NA, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, NA, 1, 1, NA, 1, 1, 1, 1, 1, 1, NA, 1, 1, 1, 1, 1, NA, 1, 1, 1, NA, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, NA, 1, NA, 1, 1, 1, 1, 1, 1, 1, 1, NA, 1, 1, NA, 10, 1, NA, 1, 1, 5, NA, 1, NA, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 7, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, NA, NA, NA, 1, 1, 1, 1, NA, NA, 1, NA, 1, 1, NA, 1, 1, 1, 1, NA, 1, 1, 1, 1, NA, 1, NA, 2, 1, 1, 1, 2, 1, 1, 1, NA, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, NA, 1, 1, 1, 1, NA, NA, 1, 1, 1, 1, 1, NA, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, NA, 1, 2, 3, 1, NA, 1, 1, 2, 1, 10, 1, 1, 1, 1, 1, 1, 1, NA, NA, NA, 1, 1, 1, 3, 1, 1, NA, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, NA, 1, 1, 10, 1, 1, 1, 1, NA, 1, NA, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 2, 1, 3, 1, NA, 1, 1, 1, 1, NA, 1, 1, 1, 1, NA, 3, 1, 1, 1, NA, 1, 1, 1, 8, 1, 1, 1, 2, 1, 1, 2, NA, 1 ))
names(breast_cancer_x) <- c( 'Cl.thickness', 'Cell.size', 'Cell.shape', 'Marg.adhesion',
'Epith.c.size', 'Bare.nuclei', 'Bl.cromatin', 'Normal.nucleoli', 'Mitoses'
)
str(breast_cancer_x)
## 'data.frame': 699 obs. of 9 variables:
## $ Cl.thickness : num 5 NA NA 6 4 8 1 2 NA NA ...
## $ Cell.size : num NA 4 NA 8 1 10 1 1 1 2 ...
## $ Cell.shape : num 1 4 1 8 1 10 NA 2 1 1 ...
## $ Marg.adhesion : num 1 NA 1 NA 3 8 NA 1 NA 1 ...
## $ Epith.c.size : num NA 7 2 NA 2 7 2 2 2 2 ...
## $ Bare.nuclei : num 1 10 NA 4 1 10 10 1 1 1 ...
## $ Bl.cromatin : num 3 3 3 3 3 NA 3 3 NA 2 ...
## $ Normal.nucleoli: num 1 NA 1 7 NA 7 1 1 1 NA ...
## $ Mitoses : num 1 1 1 1 1 1 1 1 5 1 ...
v1 <- c( 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 2, 2, 1, 1, 2, 1, 2, 2, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 2, 2, 1, 2, 2, 2, 2, 1, 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 1, 2, 1, 2, 2, 1, 1, 2, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 1, 2, 1, 2, 2, 2, 1, 1, 1, 2, 1, 1, 1, 1, 2, 2, 2, 1, 2, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 2, 1, 2, 2, 1, 1, 2, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 2, 2, 2, 1, 2, 1, 2, 1, 1, 1, 2, 2, 1, 2, 2, 2, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 2, 2, 1, 1, 1, 2, 2, 1, 2, 2, 2, 1, 1, 2, 1, 1, 2, 2, 2, 2, 1, 2, 2, 1, 2, 2, 2, 1, 2, 1, 1, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 2, 1, 2, 2, 2, 1, 1, 1, 1, 2, 2, 2, 2, 2, 1, 2, 2, 2, 1, 2, 1, 2, 2, 1, 1, 1, 1, 1, 2, 1, 1, 2, 2, 2, 2, 2, 1, 2, 2, 1, 1, 2, 2, 1, 2, 1, 1, 1, 2, 2, 1, 2, 1, 2, 2, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1, 2, 1, 1, 2, 1, 2, 2, 2, 1, 1, 2, 2, 1, 2, 1, 1, 2, 2, 1, 1, 1, 2, 1, 1, 1, 2, 2, 1 )
breast_cancer_y <- ifelse(c( v1, 1, 1, 2, 1, 1, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 1, 2, 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 2, 1, 1, 1, 2, 2, 2, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 2, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 2, 2, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 2, 1, 2, 1, 2, 2, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 2, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 2, 2, 2 ) == 2, "malignant", "benign")
breast_cancer_y <- factor(breast_cancer_y, levels=c("benign", "malignant"))
str(breast_cancer_y)
## Factor w/ 2 levels "benign","malignant": 1 1 1 1 1 2 1 1 1 1 ...
# Create custom trainControl: myControl
myControl <- caret::trainControl(
method = "cv", number = 10,
summaryFunction = twoClassSummary,
classProbs = TRUE, # IMPORTANT!
verboseIter = TRUE
)
# Apply median imputation: model
model <- caret::train(
x = breast_cancer_x, y = breast_cancer_y,
method = "glm",
trControl = myControl,
preProcess = "medianImpute"
)
## Warning in train.default(x = breast_cancer_x, y = breast_cancer_y, method =
## "glm", : The metric "Accuracy" was not in the result set. ROC will be used
## instead.
## + Fold01: parameter=none
## - Fold01: parameter=none
## + Fold02: parameter=none
## - Fold02: parameter=none
## + Fold03: parameter=none
## - Fold03: parameter=none
## + Fold04: parameter=none
## - Fold04: parameter=none
## + Fold05: parameter=none
## - Fold05: parameter=none
## + Fold06: parameter=none
## - Fold06: parameter=none
## + Fold07: parameter=none
## - Fold07: parameter=none
## + Fold08: parameter=none
## - Fold08: parameter=none
## + Fold09: parameter=none
## - Fold09: parameter=none
## + Fold10: parameter=none
## - Fold10: parameter=none
## Aggregating results
## Fitting final model on full training set
# Print model to console
model
## Generalized Linear Model
##
## 699 samples
## 9 predictor
## 2 classes: 'benign', 'malignant'
##
## Pre-processing: median imputation (9)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 629, 630, 630, 628, 629, 629, ...
## Resampling results:
##
## ROC Sens Spec
## 0.9923897 0.9695169 0.9421667
# Apply KNN imputation: model2
model2 <- caret::train(
x = breast_cancer_x, y = breast_cancer_y,
method = "glm",
trControl = myControl,
preProcess = "knnImpute"
)
## Warning in train.default(x = breast_cancer_x, y = breast_cancer_y, method =
## "glm", : The metric "Accuracy" was not in the result set. ROC will be used
## instead.
## + Fold01: parameter=none
## - Fold01: parameter=none
## + Fold02: parameter=none
## - Fold02: parameter=none
## + Fold03: parameter=none
## - Fold03: parameter=none
## + Fold04: parameter=none
## - Fold04: parameter=none
## + Fold05: parameter=none
## - Fold05: parameter=none
## + Fold06: parameter=none
## - Fold06: parameter=none
## + Fold07: parameter=none
## - Fold07: parameter=none
## + Fold08: parameter=none
## - Fold08: parameter=none
## + Fold09: parameter=none
## - Fold09: parameter=none
## + Fold10: parameter=none
## - Fold10: parameter=none
## Aggregating results
## Fitting final model on full training set
# Print model to console
model2
## Generalized Linear Model
##
## 699 samples
## 9 predictor
## 2 classes: 'benign', 'malignant'
##
## Pre-processing: nearest neighbor imputation (9), centered (9), scaled (9)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 629, 628, 630, 629, 629, 630, ...
## Resampling results:
##
## ROC Sens Spec
## 0.9925008 0.9716908 0.9291667
# Fit glm with median imputation: model1
model1 <- caret::train(
x = breast_cancer_x, y = breast_cancer_y,
method = "glm",
trControl = myControl,
preProcess = "medianImpute"
)
## Warning in train.default(x = breast_cancer_x, y = breast_cancer_y, method =
## "glm", : The metric "Accuracy" was not in the result set. ROC will be used
## instead.
## + Fold01: parameter=none
## - Fold01: parameter=none
## + Fold02: parameter=none
## - Fold02: parameter=none
## + Fold03: parameter=none
## - Fold03: parameter=none
## + Fold04: parameter=none
## - Fold04: parameter=none
## + Fold05: parameter=none
## - Fold05: parameter=none
## + Fold06: parameter=none
## - Fold06: parameter=none
## + Fold07: parameter=none
## - Fold07: parameter=none
## + Fold08: parameter=none
## - Fold08: parameter=none
## + Fold09: parameter=none
## - Fold09: parameter=none
## + Fold10: parameter=none
## - Fold10: parameter=none
## Aggregating results
## Fitting final model on full training set
# Print model1
model1
## Generalized Linear Model
##
## 699 samples
## 9 predictor
## 2 classes: 'benign', 'malignant'
##
## Pre-processing: median imputation (9)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 629, 629, 630, 629, 630, 628, ...
## Resampling results:
##
## ROC Sens Spec
## 0.9913635 0.9694203 0.9461667
# Fit glm with median imputation and standardization: model2
model2 <- caret::train(
x = breast_cancer_x, y = breast_cancer_y,
method = "glm",
trControl = myControl,
preProcess = c("medianImpute", "center", "scale")
)
## Warning in train.default(x = breast_cancer_x, y = breast_cancer_y, method =
## "glm", : The metric "Accuracy" was not in the result set. ROC will be used
## instead.
## + Fold01: parameter=none
## - Fold01: parameter=none
## + Fold02: parameter=none
## - Fold02: parameter=none
## + Fold03: parameter=none
## - Fold03: parameter=none
## + Fold04: parameter=none
## - Fold04: parameter=none
## + Fold05: parameter=none
## - Fold05: parameter=none
## + Fold06: parameter=none
## - Fold06: parameter=none
## + Fold07: parameter=none
## - Fold07: parameter=none
## + Fold08: parameter=none
## - Fold08: parameter=none
## + Fold09: parameter=none
## - Fold09: parameter=none
## + Fold10: parameter=none
## - Fold10: parameter=none
## Aggregating results
## Fitting final model on full training set
# Print model2
model2
## Generalized Linear Model
##
## 699 samples
## 9 predictor
## 2 classes: 'benign', 'malignant'
##
## Pre-processing: median imputation (9), centered (9), scaled (9)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 629, 628, 630, 629, 629, 629, ...
## Resampling results:
##
## ROC Sens Spec
## 0.9917448 0.9694203 0.9418333
data(BloodBrain, package="caret") # produces logBBB (y) and bbbDescr (x)
bloodbrain_y <- logBBB
keyNames <- c( 'tpsa', 'nbasic', 'vsa_hyd', 'a_aro', 'weight', 'peoe_vsa.0', 'peoe_vsa.1', 'peoe_vsa.2', 'peoe_vsa.3', 'peoe_vsa.4', 'peoe_vsa.5', 'peoe_vsa.6', 'peoe_vsa.0.1', 'peoe_vsa.1.1', 'peoe_vsa.2.1', 'peoe_vsa.3.1', 'peoe_vsa.4.1', 'peoe_vsa.5.1', 'peoe_vsa.6.1', 'a_acc', 'a_acid', 'a_base', 'vsa_acc', 'vsa_acid', 'vsa_base', 'vsa_don', 'vsa_other', 'vsa_pol', 'slogp_vsa0', 'slogp_vsa1', 'slogp_vsa2', 'slogp_vsa3', 'slogp_vsa4', 'slogp_vsa5', 'slogp_vsa6', 'slogp_vsa7', 'slogp_vsa8', 'slogp_vsa9', 'smr_vsa0', 'smr_vsa1', 'smr_vsa2', 'smr_vsa3', 'smr_vsa4', 'smr_vsa5', 'smr_vsa6', 'smr_vsa7', 'tpsa.1', 'logp.o.w.', 'frac.anion7.', 'frac.cation7.', 'andrewbind', 'rotatablebonds', 'mlogp', 'clogp', 'mw', 'nocount', 'hbdnr', 'rule.of.5violations', 'prx', 'ub', 'pol', 'inthb', 'adistm', 'adistd', 'polar_area', 'nonpolar_area', 'psa_npsa', 'tcsa', 'tcpa', 'tcnp', 'ovality', 'surface_area', 'volume', 'most_negative_charge', 'most_positive_charge', 'sum_absolute_charge', 'dipole_moment', 'homo', 'lumo', 'hardness', 'ppsa1', 'ppsa2', 'ppsa3', 'pnsa1', 'pnsa2', 'pnsa3', 'fpsa1', 'fpsa2', 'fpsa3', 'fnsa1', 'fnsa2', 'fnsa3', 'wpsa1', 'wpsa2', 'wpsa3', 'wnsa1', 'wnsa2', 'wnsa3', 'dpsa1', 'dpsa2', 'dpsa3', 'rpcg', 'rncg', 'wpcs', 'wncs', 'sadh1', 'sadh2', 'sadh3', 'chdh1', 'chdh2', 'chdh3', 'scdh1', 'scdh2', 'scdh3', 'saaa1', 'saaa2', 'saaa3', 'chaa1', 'chaa2', 'chaa3', 'scaa1', 'scaa2', 'scaa3', 'ctdh', 'ctaa', 'mchg', 'achg', 'rdta', 'n_sp2', 'n_sp3', 'o_sp2', 'o_sp3' )
bloodbrain_x <- bbbDescr[, keyNames]
dim(bloodbrain_x)
## [1] 208 132
# Identify near zero variance predictors: remove_cols
remove_cols <- caret::nearZeroVar(bloodbrain_x, names = TRUE,
freqCut = 2, uniqueCut = 20)
# Get all column names from bloodbrain_x: all_cols
all_cols <- names(bloodbrain_x)
# Remove from data: bloodbrain_x_small
bloodbrain_x_small <- bloodbrain_x[ , setdiff(all_cols, remove_cols)]
# Fit model on reduced data: model
model <- caret::train(x = bloodbrain_x_small, y = bloodbrain_y, method = "glm")
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
# Print model to console
model
## Generalized Linear Model
##
## 208 samples
## 112 predictors
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 208, 208, 208, 208, 208, 208, ...
## Resampling results:
##
## RMSE Rsquared
## 1.782164 0.1089338
# Fit glm model using PCA: model
model <- caret::train(
x = bloodbrain_x, y = bloodbrain_y,
method = "glm", preProcess = "pca"
)
# Print model to console
model
## Generalized Linear Model
##
## 208 samples
## 132 predictors
##
## Pre-processing: principal component signal extraction (132),
## centered (132), scaled (132)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 208, 208, 208, 208, 208, 208, ...
## Resampling results:
##
## RMSE Rsquared
## 0.6397498 0.4168059
Chapter 5 - Selecting Models Case Study (Customer Churn)
Reusing a trainControl - to compare apples to apples, make sure that all the models use the same training/test splits:
Reintroduce glmnet - linear model with built-in variable selection:
Reintroduce random forest - often the second model to try on a new predictive model:
Comparing models - assess the quality of the predictions (apples to apples is gained by using the same test=train splits on the data):
More on resamples - many great methods and inspired the caretEnsembles package:
Example code includes:
data(churn, package="C50")
sum(is.na(churnTrain)) # 0
## [1] 0
dim(churnTrain) # 3333 x 20
## [1] 3333 20
keyStateNums <- c( 5, 4, 8, 3, 4, 3, 4, 3, 5, 4, 4, 3, 3, 9, 4, 1, 9, 8, 4, 3, 4, 5, 4, 6, 6,
1, 4, 3, 3, 8, 4, 7, 5, 5, 6, 7, 4, 6, 5, 6, 9, 3, 5, 4, 6, 11, 2, 4, 2, 9, 5
)
keyStateNames <- c( 'AK', 'AL', 'AR', 'AZ', 'CA', 'CO', 'CT', 'DC', 'DE', 'FL', 'GA', 'HI', 'IA',
'ID', 'IL', 'IN', 'KS', 'KY', 'LA', 'MA', 'MD', 'ME', 'MI', 'MN', 'MO', 'MS',
'MT', 'NC', 'ND', 'NE', 'NH', 'NJ', 'NM', 'NV', 'NY', 'OH', 'OK', 'OR', 'PA',
'RI', 'SC', 'SD', 'TN', 'TX', 'UT', 'VA', 'VT', 'WA', 'WI', 'WV', 'WY'
)
keyIdx <- integer(0)
for (eachState in keyStateNames) {
keyIdx <- c(keyIdx,
sort(sample(as.integer(row.names(churnTrain[churnTrain$state == eachState, ])),
size=keyStateNums[match(eachState, keyStateNames)], replace=FALSE
)
)
)
}
churn_x <- churnTrain[keyIdx, ] %>%
mutate(international_planyes=as.integer(international_plan=="yes"),
area_codearea_code_415=as.integer(area_code=="area_code_415"),
area_codearea_code_510=as.integer(area_code=="area_code_510"),
voice_mail_planyes=as.integer(voice_mail_plan=="yes")
) %>%
select(-c(state, churn, area_code, international_plan, voice_mail_plan))
churn_y <- factor(churnTrain[keyIdx, "churn"], levels=c("no", "yes"))
stateCols <- matrix(data=0L, nrow=sum(keyStateNums), ncol=length(keyStateNums))
curCol <- 1
curRow <- 1
for (intCtr in cumsum(keyStateNums)) {
stateCols[curRow:intCtr, curCol] <- 1L
curCol <- curCol + 1
curRow <- intCtr + 1
}
stateDF <- as.data.frame(stateCols)
names(stateDF) <- paste0("state", keyStateNames)
churn_x <- cbind(churn_x, stateDF)
# Create custom indices: myFolds
myFolds <- caret::createFolds(churn_y, k = 5)
# Create reusable trainControl object: myControl
myControl <- caret::trainControl(
summaryFunction = twoClassSummary,
classProbs = TRUE, # IMPORTANT!
verboseIter = TRUE,
savePredictions = TRUE,
index = myFolds
)
# Fit glmnet model: model_glmnet
model_glmnet <- caret::train(
x = churn_x, y = churn_y,
metric = "ROC",
method = "glmnet",
trControl = myControl
)
## Loading required package: glmnet
## Loading required package: Matrix
## Loading required package: foreach
##
## Attaching package: 'foreach'
## The following objects are masked from 'package:purrr':
##
## accumulate, when
## Loaded glmnet 2.0-10
## + Fold1: alpha=0.10, lambda=0.02283
## - Fold1: alpha=0.10, lambda=0.02283
## + Fold1: alpha=0.55, lambda=0.02283
## - Fold1: alpha=0.55, lambda=0.02283
## + Fold1: alpha=1.00, lambda=0.02283
## - Fold1: alpha=1.00, lambda=0.02283
## + Fold2: alpha=0.10, lambda=0.02283
## - Fold2: alpha=0.10, lambda=0.02283
## + Fold2: alpha=0.55, lambda=0.02283
## - Fold2: alpha=0.55, lambda=0.02283
## + Fold2: alpha=1.00, lambda=0.02283
## - Fold2: alpha=1.00, lambda=0.02283
## + Fold3: alpha=0.10, lambda=0.02283
## - Fold3: alpha=0.10, lambda=0.02283
## + Fold3: alpha=0.55, lambda=0.02283
## - Fold3: alpha=0.55, lambda=0.02283
## + Fold3: alpha=1.00, lambda=0.02283
## - Fold3: alpha=1.00, lambda=0.02283
## + Fold4: alpha=0.10, lambda=0.02283
## - Fold4: alpha=0.10, lambda=0.02283
## + Fold4: alpha=0.55, lambda=0.02283
## - Fold4: alpha=0.55, lambda=0.02283
## + Fold4: alpha=1.00, lambda=0.02283
## - Fold4: alpha=1.00, lambda=0.02283
## + Fold5: alpha=0.10, lambda=0.02283
## - Fold5: alpha=0.10, lambda=0.02283
## + Fold5: alpha=0.55, lambda=0.02283
## - Fold5: alpha=0.55, lambda=0.02283
## + Fold5: alpha=1.00, lambda=0.02283
## - Fold5: alpha=1.00, lambda=0.02283
## Aggregating results
## Selecting tuning parameters
## Fitting alpha = 1, lambda = 0.0228 on full training set
# Fit random forest: model_rf
model_rf <- caret::train(
x = churn_x, y = churn_y,
metric = "ROC",
method = "ranger",
trControl = myControl
)
## + Fold1: mtry= 2
## - Fold1: mtry= 2
## + Fold1: mtry=36
## - Fold1: mtry=36
## + Fold1: mtry=70
## - Fold1: mtry=70
## + Fold2: mtry= 2
## - Fold2: mtry= 2
## + Fold2: mtry=36
## - Fold2: mtry=36
## + Fold2: mtry=70
## - Fold2: mtry=70
## + Fold3: mtry= 2
## - Fold3: mtry= 2
## + Fold3: mtry=36
## - Fold3: mtry=36
## + Fold3: mtry=70
## - Fold3: mtry=70
## + Fold4: mtry= 2
## - Fold4: mtry= 2
## + Fold4: mtry=36
## - Fold4: mtry=36
## + Fold4: mtry=70
## - Fold4: mtry=70
## + Fold5: mtry= 2
## - Fold5: mtry= 2
## + Fold5: mtry=36
## - Fold5: mtry=36
## + Fold5: mtry=70
## - Fold5: mtry=70
## Aggregating results
## Selecting tuning parameters
## Fitting mtry = 36 on full training set
# Create model_list
model_list <- list(item1 = model_glmnet, item2 = model_rf)
# Pass model_list to resamples(): resamples
resamps <- caret::resamples(model_list)
# Summarize the results
summary(resamps)
##
## Call:
## summary.resamples(object = resamps)
##
## Models: item1, item2
## Number of resamples: 5
##
## ROC
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## item1 0.5436 0.5901 0.5959 0.6107 0.6133 0.7107 0
## item2 0.6649 0.6705 0.6824 0.6884 0.7113 0.7127 0
##
## Sens
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## item1 0.9264 0.9451 0.9509 0.9534 0.9571 0.9877 0
## item2 0.9387 0.9634 0.9693 0.9681 0.9816 0.9877 0
##
## Spec
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## item1 0.08108 0.08108 0.1081 0.1417 0.2162 0.2222 0
## item2 0.13510 0.18920 0.2222 0.2066 0.2432 0.2432 0
# Create bwplot
bwplot(resamps, metric="ROC")
# Create xyplot
xyplot(resamps, metric="ROC")
# Create ensemble model: stack
# Crashes out on my machine; not sure why . . .
# stack <- caretEnsemble::caretStack(model_list, method="glm")
# Look at summary
# summary(stack)
Chapter 1 - Jumping in
What is text mining? The process of distilling actionable insights from text:
Getting started - “bag of words” did not care about word types, so verbs and conjections and the like are treated the same as nouns:
Cleaning and pre-processing text - common pre-processing functions:
TDM (term-document matrix) and DTM (document-term matrix):
Example code includes:
library(qdap) # Will require R 3.3.1 or higher for dependency "slam"
## Loading required package: qdapDictionaries
## Loading required package: qdapRegex
##
## Attaching package: 'qdapRegex'
## The following object is masked from 'package:dplyr':
##
## explain
## The following object is masked from 'package:ggplot2':
##
## %+%
## Loading required package: qdapTools
##
## Attaching package: 'qdapTools'
## The following object is masked from 'package:dplyr':
##
## id
##
## Attaching package: 'qdap'
## The following object is masked from 'package:Matrix':
##
## %&%
## The following object is masked from 'package:purrr':
##
## %>%
## The following object is masked from 'package:dplyr':
##
## %>%
## The following object is masked from 'package:base':
##
## Filter
new_text <- "DataCamp is the first online learning platform that focuses on building the best learning experience specifically for Data Science. We have offices in Boston and Belgium and to date, we trained over 250,000 (aspiring) data scientists in over 150 countries. These data science enthusiasts completed more than 9 million exercises. You can take free beginner courses, or subscribe for $25/month to get access to all premium courses."
# Print new_text to the console
new_text
## [1] "DataCamp is the first online learning platform that focuses on building the best learning experience specifically for Data Science. We have offices in Boston and Belgium and to date, we trained over 250,000 (aspiring) data scientists in over 150 countries. These data science enthusiasts completed more than 9 million exercises. You can take free beginner courses, or subscribe for $25/month to get access to all premium courses."
# Find the 10 most frequent terms: term_count
term_count <- qdap::freq_terms(new_text, 10)
# Plot term_count
plot(term_count)
# Import text data
rawTweets <- read.csv("BagOfWordsTweetData_v001.csv", stringsAsFactors=FALSE)
str(rawTweets)
## 'data.frame': 1000 obs. of 2 variables:
## $ Coffee : chr " @ayyytylerb that is so true drink lots of coffee" " RT @bryzy_brib: Senior March tmw morning at 7:25 A.M. in the SENIOR lot. Get up early, make yo coffee/breakfast, cus this will"| __truncated__ " If you believe in #gunsense tomorrow would be a very good day to have your coffee any place BUT @Starbucks Guns+Coffee=#nosens"| __truncated__ " My cute coffee mug. http://t.co/2udvMU6XIG" ...
## $ Chardonnay: chr " RT @oceanclub: @eilisohanlon @stonyjim @vonprond Eilis, I'm from Pearse St and even I can tell a Chardonnay from so?" " ?@roystbaggage: 'Go to your Auntie Chardonnay and she will help you piss up against that wall' - the scum of Dover.?what's thi"| __truncated__ " Big thank you to Ian at Fowles wine for making me a Chardonnay drinker. @LadiesWhoShoot #wrongwayround http://t.co/KiA2StsOEO" " RT @oceanclub: @eilisohanlon @stonyjim @vonprond Eilis, I'm from Pearse St and even I can tell a Chardonnay from so?" ...
# Isolate coffee text from tweets: coffee_tweets
coffee_tweets <- rawTweets$Coffee
# Load tm
library(tm)
## Loading required package: NLP
##
## Attaching package: 'NLP'
## The following object is masked from 'package:qdap':
##
## ngrams
## The following object is masked from 'package:ggplot2':
##
## annotate
##
## Attaching package: 'tm'
## The following objects are masked from 'package:qdap':
##
## as.DocumentTermMatrix, as.TermDocumentMatrix
# Make a vector source: coffee_source
coffee_source <- VectorSource(coffee_tweets)
# Make a volatile corpus: coffee_corpus
coffee_corpus <- VCorpus(coffee_source)
# Print out coffee_corpus
coffee_corpus
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 1000
# Print data on the 15th tweet in coffee_corpus
coffee_corpus[[15]]
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 112
# Print the content of the 15th tweet in coffee_corpus
coffee_corpus[[15]]$content
## [1] " @HeatherWhaley I was about 2 joke it takes 2 hands to hold hot coffee...then I read headline! #Don'tDrinkNShoot"
example_text <- data.frame(num=1:3, Author1=c('Text mining is a great time.', 'Text analysis provides insights', 'qdap and tm are used in text mining'), Author2=c('R is a great language', 'R has many uses', 'DataCamp is cool!'), stringsAsFactors=FALSE)
# Print example_text to the console
example_text
## num Author1 Author2
## 1 1 Text mining is a great time. R is a great language
## 2 2 Text analysis provides insights R has many uses
## 3 3 qdap and tm are used in text mining DataCamp is cool!
# Create a DataframeSource on columns 2 and 3: df_source
df_source <- tm::DataframeSource(example_text[,-1])
# Convert df_source to a corpus: df_corpus
df_corpus <- tm::VCorpus(df_source)
# Examine df_corpus
df_corpus
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 3
# Create a VectorSource on column 3: vec_source
vec_source <- tm::VectorSource(example_text[, 3])
# Convert vec_source to a corpus: vec_corpus
vec_corpus <- tm::VCorpus(vec_source)
# Examine vec_corpus
vec_corpus
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 3
# Create the object: text
text <- "<b>She</b> woke up at 6 A.M. It\'s so early! She was only 10% awake and began drinking coffee in front of her computer."
# All lowercase
tolower(text)
## [1] "<b>she</b> woke up at 6 a.m. it's so early! she was only 10% awake and began drinking coffee in front of her computer."
# Remove punctuation
tm::removePunctuation(text)
## [1] "bSheb woke up at 6 AM Its so early She was only 10 awake and began drinking coffee in front of her computer"
# Remove numbers
tm::removeNumbers(text)
## [1] "<b>She</b> woke up at A.M. It's so early! She was only % awake and began drinking coffee in front of her computer."
# Remove whitespace
tm::stripWhitespace(text)
## [1] "<b>She</b> woke up at 6 A.M. It's so early! She was only 10% awake and began drinking coffee in front of her computer."
# Remove text within brackets
bracketX(text)
## [1] "She woke up at 6 A.M. It's so early! She was only 10% awake and began drinking coffee in front of her computer."
# Replace numbers with words
replace_number(text)
## [1] "<b>She</b> woke up at six A.M. It's so early! She was only ten% awake and began drinking coffee in front of her computer."
# Replace abbreviations
replace_abbreviation(text)
## [1] "<b>She</b> woke up at 6 AM It's so early! She was only 10% awake and began drinking coffee in front of her computer."
# Replace contractions
replace_contraction(text)
## [1] "<b>She</b> woke up at 6 A.M. it is so early! She was only 10% awake and began drinking coffee in front of her computer."
# Replace symbols with words
replace_symbol(text)
## [1] "<b>She</b> woke up at 6 A.M. It's so early! She was only 10 percent awake and began drinking coffee in front of her computer."
# List standard English stop words
tm::stopwords("en")
## [1] "i" "me" "my" "myself" "we"
## [6] "our" "ours" "ourselves" "you" "your"
## [11] "yours" "yourself" "yourselves" "he" "him"
## [16] "his" "himself" "she" "her" "hers"
## [21] "herself" "it" "its" "itself" "they"
## [26] "them" "their" "theirs" "themselves" "what"
## [31] "which" "who" "whom" "this" "that"
## [36] "these" "those" "am" "is" "are"
## [41] "was" "were" "be" "been" "being"
## [46] "have" "has" "had" "having" "do"
## [51] "does" "did" "doing" "would" "should"
## [56] "could" "ought" "i'm" "you're" "he's"
## [61] "she's" "it's" "we're" "they're" "i've"
## [66] "you've" "we've" "they've" "i'd" "you'd"
## [71] "he'd" "she'd" "we'd" "they'd" "i'll"
## [76] "you'll" "he'll" "she'll" "we'll" "they'll"
## [81] "isn't" "aren't" "wasn't" "weren't" "hasn't"
## [86] "haven't" "hadn't" "doesn't" "don't" "didn't"
## [91] "won't" "wouldn't" "shan't" "shouldn't" "can't"
## [96] "cannot" "couldn't" "mustn't" "let's" "that's"
## [101] "who's" "what's" "here's" "there's" "when's"
## [106] "where's" "why's" "how's" "a" "an"
## [111] "the" "and" "but" "if" "or"
## [116] "because" "as" "until" "while" "of"
## [121] "at" "by" "for" "with" "about"
## [126] "against" "between" "into" "through" "during"
## [131] "before" "after" "above" "below" "to"
## [136] "from" "up" "down" "in" "out"
## [141] "on" "off" "over" "under" "again"
## [146] "further" "then" "once" "here" "there"
## [151] "when" "where" "why" "how" "all"
## [156] "any" "both" "each" "few" "more"
## [161] "most" "other" "some" "such" "no"
## [166] "nor" "not" "only" "own" "same"
## [171] "so" "than" "too" "very"
# Print text without standard stop words
tm::removeWords(text, tm::stopwords("en"))
## [1] "<b>She</b> woke 6 A.M. It's early! She 10% awake began drinking coffee front computer."
# Add "coffee" and "bean" to the list: new_stops
new_stops <- c("coffee", "bean", tm::stopwords("en"))
# Remove stop words from text
tm::removeWords(text, new_stops)
## [1] "<b>She</b> woke 6 A.M. It's early! She 10% awake began drinking front computer."
# Create complicate
complicate <- c("complicated", "complication", "complicatedly")
# Perform word stemming: stem_doc
stem_doc <- tm::stemDocument(complicate)
# Create the completion dictionary: comp_dict
comp_dict <- "complicate"
# Perform stem completion: complete_text
complete_text <- tm::stemCompletion(stem_doc, comp_dict)
# Print complete_text
complete_text
## complic complic complic
## "complicate" "complicate" "complicate"
# NEED FULL DICTIONARIES FOR THESE
# Remove punctuation: rm_punc
# rm_punc <- tm::removePunctuation(text_doc)
# Create character vector: n_char_vec
# n_char_vec <- unlist(strsplit(rm_punc, split = ' '))
# Perform word stemming: stem_doc
# stem_doc <- tm::stemDocument(n_char_vec)
# Print stem_doc
# stem_doc
# Re-complete stemmed document: complete_doc
# complete_doc <- tm::stemCompletion(stem_doc, comp_dict)
# Print complete_doc
# complete_doc
### DO NOT HAVE THE TWEET_CORP FILE (probably the coffee tweets corpus mentioned above)
# Alter the function code to match the instructions
clean_corpus <- function(corpus){
corpus <- tm_map(corpus, stripWhitespace)
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removeWords, c(stopwords("en"), "coffee", "mug"))
return(corpus)
}
# Apply your customized function to the tweet_corp: clean_corp
# Applied to coffee_corpus instead
clean_corp <- clean_corpus(coffee_corpus)
# Print out a cleaned up tweet
clean_corp[[227]][1]
## $content
## [1] " also dogs arent smart enough dip donut eat part thats dipped ladyandthetramp"
# Print out the same tweet in original form
coffee_tweets[227]
## [1] " Also, dogs aren't smart enough to dip the donut in the coffee and then eat the part that's been dipped. #ladyandthetramp"
# Create the dtm from the corpus: coffee_dtm
coffee_dtm <- DocumentTermMatrix(clean_corp)
# Print out coffee_dtm data
coffee_dtm
## <<DocumentTermMatrix (documents: 1000, terms: 3075)>>
## Non-/sparse entries: 7384/3067616
## Sparsity : 100%
## Maximal term length: 27
## Weighting : term frequency (tf)
# Convert coffee_dtm to a matrix: coffee_m
coffee_m <- as.matrix(coffee_dtm)
# Print the dimensions of coffee_m
dim(coffee_m)
## [1] 1000 3075
# Review a portion of the matrix
coffee_m[148:150, 2587:2590]
## Terms
## Docs stampedeblue stand star starbucks
## 148 0 0 0 0
## 149 0 0 0 0
## 150 0 0 0 0
# Create a TDM from clean_corp: coffee_tdm
coffee_tdm <- TermDocumentMatrix(clean_corp)
# Print coffee_tdm data
coffee_tdm
## <<TermDocumentMatrix (terms: 3075, documents: 1000)>>
## Non-/sparse entries: 7384/3067616
## Sparsity : 100%
## Maximal term length: 27
## Weighting : term frequency (tf)
# Convert coffee_tdm to a matrix: coffee_m
coffee_m <- as.matrix(coffee_tdm)
# Print the dimensions of the matrix
dim(coffee_m)
## [1] 3075 1000
# Review a portion of the matrix
coffee_m[2587:2590, 148:150]
## Docs
## Terms 148 149 150
## stampedeblue 0 0 0
## stand 0 0 0
## star 0 0 0
## starbucks 0 0 0
Chapter 2 - Word Clouds and Visuals
Common text mining visuals - good visualizations help with making quick conclusions:
Introduction to word clouds - more popular for of word plot, with size typically defaulting to frequency:
Other word clouds and word networks:
Example code includes:
# Create a matrix: coffee_m
coffee_m <- as.matrix(coffee_tdm)
# Calculate the rowSums: term_frequency
term_frequency <- rowSums(coffee_m)
# Sort term_frequency in descending order
term_frequency <- sort(term_frequency, decreasing=TRUE)
# View the top 10 most common words
term_frequency[1:10]
## like cup shop just get morning want drinking
## 111 103 69 66 62 57 49 47
## can looks
## 45 45
# Plot a barchart of the 10 most common words
barplot(term_frequency[1:10], las=2, col="tan")
# Create frequency
frequency <- qdap::freq_terms(coffee_tweets, top=10, at.least=3, stopwords="Top200Words")
# Make a frequency barchart
plot(frequency)
# Create frequency2
frequency2 <- qdap::freq_terms(coffee_tweets, top=10, at.least=3, stopwords=tm::stopwords("english"))
# Make a frequency2 barchart
plot(frequency2)
# Creating a smaller version of the second term_frequency file (only words with 5+ appearances)
term_frequency <- c( 824, 104, 83, 76, 75, 63, 52, 47, 43, 35, 34, 32, 32, 25, 24, 24, 24, 24, 23, 23, 23, 22, 22, 22, 21, 21, 21, 21, 21, 21, 21, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 12, 12, 12, 12, 12, 12, 12, 12, 12, 11, 11, 11, 11, 11, 11, 11, 11, 11, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5 )
names(term_frequency) <- c( 'chardonnay', 'marvin', 'wine', 'gaye', 'just', 'glass', 'like', 'bottle', 'lol', 'little', 'rose', 'dont', 'get', 'now', 'ass', 'can', 'know', 'love', 'drink', 'good', 'will', 'girl', 'night', 'time', 'cabernet', 'chocolate', 'still', 'thats', 'think', 'unoaked', 'well', 'milkshake', 'see', 'big', 'double', 'fists', 'inspired', 'jinkx', 'jinkxmonsoon', 'polite', 'really', 'better', 'dinner', 'got', 'httptcodudylkw', 'charles', 'fine', 'full', 'mood', 'nice', 'shiraz', 'day', 'drinking', 'naked', 'pwcwines', 'set', 'white', 'chicken', 'fancy', 'need', 'winning', 'always', 'beauty', 'board', 'bushes', 'competition', 'donjon', 'fell', 'grace', 'meeting', 'moms', 'one', 'pinot', 'porch', 'remember', 'wait', 'wanna', 'wonderwines', 'yall', 'best', 'https', 'januaryjames', 'live', 'make', 'new', 'pretty', 'right', 'way', 'chipotletwins', 'happy', 'name', 'oaked', 'old', 'shit', 'sippchardonnay', 'tasting', 'thanks', 'call', 'called', 'going', 'great', 'people', 'say', 'try', 'want', 'yes', 'brought', 'cant', 'first', 'fourvines', 'lot', 'noir', 'tell', 'today', 'tonight', 'video', 'winewednesday', 'cake', 'check', 'cute', 'even', 'game', 'jason', 'last', 'miss', 'mzchardonnay', 'sauce', 'sean', 'song', 'take', 'tho', 'valley', 'wines', 'around', 'away', 'back', 'bought', 'box', 'classy', 'cream', 'estate', 'fuck', 'gay', 'hey', 'home', 'ive', 'let', 'liked', 'lingerie', 'lmfaoo', 'mom', 'moscato', 'mushroom', 'please', 'rainbow', 'red', 'sauvignon', 'school', 'thank', 'vineyards', 'aint', 'beautiful', 'black', 'blue', 'boys', 'cheers', 'cool', 'dairy', 'food', 'goony', 'green', 'hoes', 'ill', 'ima', 'irishtexan', 'jamesthewineguy', 'lady', 'life', 'lil', 'listen', 'man', 'mantsoepout', 'mind', 'much', 'nah', 'qveenm', 'room', 'sippin', 'sipping', 'smh', 'text', 'thegamebet', 'veraison', 'asking', 'bcwine', 'bit', 'blanc', 'boity', 'buttery', 'chard', 'confessyourunpopularopinion', 'date', 'debortoliwines', 'drank', 'drunk', 'fruit', 'give', 'house', 'huntervalley', 'keep', 'ladieswhoshoot', 'late', 'lovely', 'month', 'never', 'notes', 'okay', 'paying', 'playing', 'question', 'seriously', 'simple', 'someone', 'started', 'stay', 'thought', 'ultimatebgc', 'vineyard', 'visit', 'walla', 'youre', 'also', 'answer', 'anytime', 'baby', 'bad', 'cause', 'citrus', 'come', 'crisp', 'ctfu', 'cyclone', 'delicious', 'dick', 'didnt', 'doesnt', 'enjoy', 'every', 'friend', 'funny', 'genay', 'glad', 'glasses', 'gonna', 'hes', 'hold', 'http', 'httptc', 'httptcoawdmglpmg', 'join', 'kinda', 'known', 'launches', 'may', 'michael', 'movies', 'next', 'paired', 'perfect', 'pinotnoir', 'point', 'poor', 'put', 'sadlife', 'said', 'salad', 'santa', 'scene', 'shout', 'special', 'stop', 'summer', 'tasha', 'work' )
# Load wordcloud package
library(wordcloud)
# Print the first 10 entries in term_frequency
term_frequency[1:10]
## chardonnay marvin wine gaye just glass
## 824 104 83 76 75 63
## like bottle lol little
## 52 47 43 35
# Create word_freqs
word_freqs <- data.frame(term=names(term_frequency), num=term_frequency)
# Create a wordcloud for the values in word_freqs
wordcloud(word_freqs$term, word_freqs$num, max.words=100, colors="red")
# Create chardonnay_corp
chardonnay_tweets <- rawTweets$Chardonnay
chardonnay_source <- VectorSource(chardonnay_tweets)
chardonnay_corp <- VCorpus(chardonnay_source)
# Add new stop words to clean_corpus()
clean_corpus <- function(corpus){
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, stripWhitespace)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removeWords,
c(stopwords("en"), "amp", "chardonnay", "wine", "glass"))
return(corpus)
}
# Create clean_chardonnay
clean_chardonnay <- clean_corpus(chardonnay_corp)
# Create chardonnay_tdm
chardonnay_tdm <- TermDocumentMatrix(clean_chardonnay)
# Create chardonnay_m
chardonnay_m <- as.matrix(chardonnay_tdm)
# Create chardonnay_words
chardonnay_words <- rowSums(chardonnay_m)
# Copying over the portion of chardonnay_words where there is frequency of 5+
# chardonnay_words <- c( 7, 5, 14, 5, 5, 8, 6, 24, 8, 5, 8, 5, 6, 7, 14, 13, 18, 19, 6, 7, 6, 7, 14, 6, 47, 8, 8, 7, 10, 14, 6, 21, 9, 11, 11, 24, 10, 5, 6, 17, 9, 7, 15, 12, 21, 5, 8, 5, 14, 6, 7, 8, 5, 5, 9, 5, 7, 6, 16, 6, 5, 5, 5, 18, 5, 14, 32, 19, 6, 23, 16, 6, 5, 8, 9, 5, 15, 14, 17, 10, 19, 7, 10, 5, 6, 8, 17, 5, 9, 8, 76, 5, 32, 22, 6, 5, 5, 11, 5, 23, 7, 18, 14, 11, 7, 12, 5, 8, 7, 5, 8, 6, 5, 13, 5, 5, 18, 6, 7, 7, 19, 7, 8, 7, 13, 9, 19, 19, 5, 75, 6, 5, 24, 5, 6, 7, 9, 6, 5, 8, 7, 52, 8, 7, 8, 7, 35, 13, 8, 43, 10, 24, 6, 13, 7, 7, 104, 5, 14, 5, 20, 7, 9, 8, 14, 6, 17, 8, 5, 7, 8, 9, 7, 16, 12, 15, 6, 13, 5, 17, 22, 10, 6, 25, 12, 6, 12, 14, 5, 6, 11, 5, 14, 5, 6, 8, 5, 19, 5, 14, 13, 5, 16, 6, 7, 8, 19, 8, 14, 13, 7, 34, 5, 5, 5, 5, 9, 8, 11, 5, 8, 9, 20, 6, 16, 17, 12, 5, 6, 12, 7, 7, 7, 6, 9, 5, 6, 6, 21, 5, 5, 9, 5, 12, 10, 7, 8, 12, 21, 7, 21, 9, 6, 22, 10, 10, 11, 6, 21, 9, 7, 10, 6, 8, 6, 14, 6, 14, 11, 13, 21, 16, 23, 9, 10, 15, 14, 5, 14, 11, 6 )
# names(chardonnay_words) <- c( 'aint', 'also', 'always', 'answer', 'anytime', 'around', 'asking', 'ass', 'away', 'baby', 'back', 'bad', 'bcwine', 'beautiful', 'beauty', 'best', 'better', 'big', 'bit', 'black', 'blanc', 'blue', 'board', 'boity', 'bottle', 'bought', 'box', 'boys', 'brought', 'bushes', 'buttery', 'cabernet', 'cake', 'call', 'called', 'can', 'cant', 'cause', 'chard', 'charles', 'check', 'cheers', 'chicken', 'chipotletwins', 'chocolate', 'citrus', 'classy', 'come', 'competition', 'confessyourunpopularopinion', 'cool', 'cream', 'crisp', 'ctfu', 'cute', 'cyclone', 'dairy', 'date', 'day', 'debortoliwines', 'delicious', 'dick', 'didnt', 'dinner', 'doesnt', 'donjon', 'dont', 'double', 'drank', 'drink', 'drinking', 'drunk', 'enjoy', 'estate', 'even', 'every', 'fancy', 'fell', 'fine', 'first', 'fists', 'food', 'fourvines', 'friend', 'fruit', 'fuck', 'full', 'funny', 'game', 'gay', 'gaye', 'genay', 'get', 'girl', 'give', 'glad', 'glasses', 'going', 'gonna', 'good', 'goony', 'got', 'grace', 'great', 'green', 'happy', 'hes', 'hey', 'hoes', 'hold', 'home', 'house', 'http', 'https', 'httptc', 'httptcoawdmglpmg', 'httptcodudylkw', 'huntervalley', 'ill', 'ima', 'inspired', 'irishtexan', 'ive', 'jamesthewineguy', 'januaryjames', 'jason', 'jinkx', 'jinkxmonsoon', 'join', 'just', 'keep', 'kinda', 'know', 'known', 'ladieswhoshoot', 'lady', 'last', 'late', 'launches', 'let', 'life', 'like', 'liked', 'lil', 'lingerie', 'listen', 'little', 'live', 'lmfaoo', 'lol', 'lot', 'love', 'lovely', 'make', 'man', 'mantsoepout', 'marvin', 'may', 'meeting', 'michael', 'milkshake', 'mind', 'miss', 'mom', 'moms', 'month', 'mood', 'moscato', 'movies', 'much', 'mushroom', 'mzchardonnay', 'nah', 'naked', 'name', 'need', 'never', 'new', 'next', 'nice', 'night', 'noir', 'notes', 'now', 'oaked', 'okay', 'old', 'one', 'paired', 'paying', 'people', 'perfect', 'pinot', 'pinotnoir', 'playing', 'please', 'point', 'polite', 'poor', 'porch', 'pretty', 'put', 'pwcwines', 'question', 'qveenm', 'rainbow', 'really', 'red', 'remember', 'right', 'room', 'rose', 'sadlife', 'said', 'salad', 'santa', 'sauce', 'sauvignon', 'say', 'scene', 'school', 'sean', 'see', 'seriously', 'set', 'shiraz', 'shit', 'shout', 'simple', 'sippchardonnay', 'sippin', 'sipping', 'smh', 'someone', 'song', 'special', 'started', 'stay', 'still', 'stop', 'summer', 'take', 'tasha', 'tasting', 'tell', 'text', 'thank', 'thanks', 'thats', 'thegamebet', 'think', 'tho', 'thought', 'time', 'today', 'tonight', 'try', 'ultimatebgc', 'unoaked', 'valley', 'veraison', 'video', 'vineyard', 'vineyards', 'visit', 'wait', 'walla', 'wanna', 'want', 'way', 'well', 'white', 'will', 'wines', 'winewednesday', 'winning', 'wonderwines', 'work', 'yall', 'yes', 'youre' )
# Sort the chardonnay_words in descending order
chardonnay_words <- sort(chardonnay_words, decreasing=TRUE)
# Print the 6 most frequent chardonnay terms
sort(chardonnay_words, decreasing=TRUE)[1:6]
## marvin gaye just like bottle lol
## 104 76 75 52 47 43
# Create chardonnay_freqs
chardonnay_freqs <- data.frame(term=names(chardonnay_words), num=chardonnay_words)
# Create a wordcloud for the values in word_freqs
wordcloud(chardonnay_freqs$term, chardonnay_freqs$num, max.words=50, colors="red")
# Print the list of colors
colors()
## [1] "white" "aliceblue" "antiquewhite"
## [4] "antiquewhite1" "antiquewhite2" "antiquewhite3"
## [7] "antiquewhite4" "aquamarine" "aquamarine1"
## [10] "aquamarine2" "aquamarine3" "aquamarine4"
## [13] "azure" "azure1" "azure2"
## [16] "azure3" "azure4" "beige"
## [19] "bisque" "bisque1" "bisque2"
## [22] "bisque3" "bisque4" "black"
## [25] "blanchedalmond" "blue" "blue1"
## [28] "blue2" "blue3" "blue4"
## [31] "blueviolet" "brown" "brown1"
## [34] "brown2" "brown3" "brown4"
## [37] "burlywood" "burlywood1" "burlywood2"
## [40] "burlywood3" "burlywood4" "cadetblue"
## [43] "cadetblue1" "cadetblue2" "cadetblue3"
## [46] "cadetblue4" "chartreuse" "chartreuse1"
## [49] "chartreuse2" "chartreuse3" "chartreuse4"
## [52] "chocolate" "chocolate1" "chocolate2"
## [55] "chocolate3" "chocolate4" "coral"
## [58] "coral1" "coral2" "coral3"
## [61] "coral4" "cornflowerblue" "cornsilk"
## [64] "cornsilk1" "cornsilk2" "cornsilk3"
## [67] "cornsilk4" "cyan" "cyan1"
## [70] "cyan2" "cyan3" "cyan4"
## [73] "darkblue" "darkcyan" "darkgoldenrod"
## [76] "darkgoldenrod1" "darkgoldenrod2" "darkgoldenrod3"
## [79] "darkgoldenrod4" "darkgray" "darkgreen"
## [82] "darkgrey" "darkkhaki" "darkmagenta"
## [85] "darkolivegreen" "darkolivegreen1" "darkolivegreen2"
## [88] "darkolivegreen3" "darkolivegreen4" "darkorange"
## [91] "darkorange1" "darkorange2" "darkorange3"
## [94] "darkorange4" "darkorchid" "darkorchid1"
## [97] "darkorchid2" "darkorchid3" "darkorchid4"
## [100] "darkred" "darksalmon" "darkseagreen"
## [103] "darkseagreen1" "darkseagreen2" "darkseagreen3"
## [106] "darkseagreen4" "darkslateblue" "darkslategray"
## [109] "darkslategray1" "darkslategray2" "darkslategray3"
## [112] "darkslategray4" "darkslategrey" "darkturquoise"
## [115] "darkviolet" "deeppink" "deeppink1"
## [118] "deeppink2" "deeppink3" "deeppink4"
## [121] "deepskyblue" "deepskyblue1" "deepskyblue2"
## [124] "deepskyblue3" "deepskyblue4" "dimgray"
## [127] "dimgrey" "dodgerblue" "dodgerblue1"
## [130] "dodgerblue2" "dodgerblue3" "dodgerblue4"
## [133] "firebrick" "firebrick1" "firebrick2"
## [136] "firebrick3" "firebrick4" "floralwhite"
## [139] "forestgreen" "gainsboro" "ghostwhite"
## [142] "gold" "gold1" "gold2"
## [145] "gold3" "gold4" "goldenrod"
## [148] "goldenrod1" "goldenrod2" "goldenrod3"
## [151] "goldenrod4" "gray" "gray0"
## [154] "gray1" "gray2" "gray3"
## [157] "gray4" "gray5" "gray6"
## [160] "gray7" "gray8" "gray9"
## [163] "gray10" "gray11" "gray12"
## [166] "gray13" "gray14" "gray15"
## [169] "gray16" "gray17" "gray18"
## [172] "gray19" "gray20" "gray21"
## [175] "gray22" "gray23" "gray24"
## [178] "gray25" "gray26" "gray27"
## [181] "gray28" "gray29" "gray30"
## [184] "gray31" "gray32" "gray33"
## [187] "gray34" "gray35" "gray36"
## [190] "gray37" "gray38" "gray39"
## [193] "gray40" "gray41" "gray42"
## [196] "gray43" "gray44" "gray45"
## [199] "gray46" "gray47" "gray48"
## [202] "gray49" "gray50" "gray51"
## [205] "gray52" "gray53" "gray54"
## [208] "gray55" "gray56" "gray57"
## [211] "gray58" "gray59" "gray60"
## [214] "gray61" "gray62" "gray63"
## [217] "gray64" "gray65" "gray66"
## [220] "gray67" "gray68" "gray69"
## [223] "gray70" "gray71" "gray72"
## [226] "gray73" "gray74" "gray75"
## [229] "gray76" "gray77" "gray78"
## [232] "gray79" "gray80" "gray81"
## [235] "gray82" "gray83" "gray84"
## [238] "gray85" "gray86" "gray87"
## [241] "gray88" "gray89" "gray90"
## [244] "gray91" "gray92" "gray93"
## [247] "gray94" "gray95" "gray96"
## [250] "gray97" "gray98" "gray99"
## [253] "gray100" "green" "green1"
## [256] "green2" "green3" "green4"
## [259] "greenyellow" "grey" "grey0"
## [262] "grey1" "grey2" "grey3"
## [265] "grey4" "grey5" "grey6"
## [268] "grey7" "grey8" "grey9"
## [271] "grey10" "grey11" "grey12"
## [274] "grey13" "grey14" "grey15"
## [277] "grey16" "grey17" "grey18"
## [280] "grey19" "grey20" "grey21"
## [283] "grey22" "grey23" "grey24"
## [286] "grey25" "grey26" "grey27"
## [289] "grey28" "grey29" "grey30"
## [292] "grey31" "grey32" "grey33"
## [295] "grey34" "grey35" "grey36"
## [298] "grey37" "grey38" "grey39"
## [301] "grey40" "grey41" "grey42"
## [304] "grey43" "grey44" "grey45"
## [307] "grey46" "grey47" "grey48"
## [310] "grey49" "grey50" "grey51"
## [313] "grey52" "grey53" "grey54"
## [316] "grey55" "grey56" "grey57"
## [319] "grey58" "grey59" "grey60"
## [322] "grey61" "grey62" "grey63"
## [325] "grey64" "grey65" "grey66"
## [328] "grey67" "grey68" "grey69"
## [331] "grey70" "grey71" "grey72"
## [334] "grey73" "grey74" "grey75"
## [337] "grey76" "grey77" "grey78"
## [340] "grey79" "grey80" "grey81"
## [343] "grey82" "grey83" "grey84"
## [346] "grey85" "grey86" "grey87"
## [349] "grey88" "grey89" "grey90"
## [352] "grey91" "grey92" "grey93"
## [355] "grey94" "grey95" "grey96"
## [358] "grey97" "grey98" "grey99"
## [361] "grey100" "honeydew" "honeydew1"
## [364] "honeydew2" "honeydew3" "honeydew4"
## [367] "hotpink" "hotpink1" "hotpink2"
## [370] "hotpink3" "hotpink4" "indianred"
## [373] "indianred1" "indianred2" "indianred3"
## [376] "indianred4" "ivory" "ivory1"
## [379] "ivory2" "ivory3" "ivory4"
## [382] "khaki" "khaki1" "khaki2"
## [385] "khaki3" "khaki4" "lavender"
## [388] "lavenderblush" "lavenderblush1" "lavenderblush2"
## [391] "lavenderblush3" "lavenderblush4" "lawngreen"
## [394] "lemonchiffon" "lemonchiffon1" "lemonchiffon2"
## [397] "lemonchiffon3" "lemonchiffon4" "lightblue"
## [400] "lightblue1" "lightblue2" "lightblue3"
## [403] "lightblue4" "lightcoral" "lightcyan"
## [406] "lightcyan1" "lightcyan2" "lightcyan3"
## [409] "lightcyan4" "lightgoldenrod" "lightgoldenrod1"
## [412] "lightgoldenrod2" "lightgoldenrod3" "lightgoldenrod4"
## [415] "lightgoldenrodyellow" "lightgray" "lightgreen"
## [418] "lightgrey" "lightpink" "lightpink1"
## [421] "lightpink2" "lightpink3" "lightpink4"
## [424] "lightsalmon" "lightsalmon1" "lightsalmon2"
## [427] "lightsalmon3" "lightsalmon4" "lightseagreen"
## [430] "lightskyblue" "lightskyblue1" "lightskyblue2"
## [433] "lightskyblue3" "lightskyblue4" "lightslateblue"
## [436] "lightslategray" "lightslategrey" "lightsteelblue"
## [439] "lightsteelblue1" "lightsteelblue2" "lightsteelblue3"
## [442] "lightsteelblue4" "lightyellow" "lightyellow1"
## [445] "lightyellow2" "lightyellow3" "lightyellow4"
## [448] "limegreen" "linen" "magenta"
## [451] "magenta1" "magenta2" "magenta3"
## [454] "magenta4" "maroon" "maroon1"
## [457] "maroon2" "maroon3" "maroon4"
## [460] "mediumaquamarine" "mediumblue" "mediumorchid"
## [463] "mediumorchid1" "mediumorchid2" "mediumorchid3"
## [466] "mediumorchid4" "mediumpurple" "mediumpurple1"
## [469] "mediumpurple2" "mediumpurple3" "mediumpurple4"
## [472] "mediumseagreen" "mediumslateblue" "mediumspringgreen"
## [475] "mediumturquoise" "mediumvioletred" "midnightblue"
## [478] "mintcream" "mistyrose" "mistyrose1"
## [481] "mistyrose2" "mistyrose3" "mistyrose4"
## [484] "moccasin" "navajowhite" "navajowhite1"
## [487] "navajowhite2" "navajowhite3" "navajowhite4"
## [490] "navy" "navyblue" "oldlace"
## [493] "olivedrab" "olivedrab1" "olivedrab2"
## [496] "olivedrab3" "olivedrab4" "orange"
## [499] "orange1" "orange2" "orange3"
## [502] "orange4" "orangered" "orangered1"
## [505] "orangered2" "orangered3" "orangered4"
## [508] "orchid" "orchid1" "orchid2"
## [511] "orchid3" "orchid4" "palegoldenrod"
## [514] "palegreen" "palegreen1" "palegreen2"
## [517] "palegreen3" "palegreen4" "paleturquoise"
## [520] "paleturquoise1" "paleturquoise2" "paleturquoise3"
## [523] "paleturquoise4" "palevioletred" "palevioletred1"
## [526] "palevioletred2" "palevioletred3" "palevioletred4"
## [529] "papayawhip" "peachpuff" "peachpuff1"
## [532] "peachpuff2" "peachpuff3" "peachpuff4"
## [535] "peru" "pink" "pink1"
## [538] "pink2" "pink3" "pink4"
## [541] "plum" "plum1" "plum2"
## [544] "plum3" "plum4" "powderblue"
## [547] "purple" "purple1" "purple2"
## [550] "purple3" "purple4" "red"
## [553] "red1" "red2" "red3"
## [556] "red4" "rosybrown" "rosybrown1"
## [559] "rosybrown2" "rosybrown3" "rosybrown4"
## [562] "royalblue" "royalblue1" "royalblue2"
## [565] "royalblue3" "royalblue4" "saddlebrown"
## [568] "salmon" "salmon1" "salmon2"
## [571] "salmon3" "salmon4" "sandybrown"
## [574] "seagreen" "seagreen1" "seagreen2"
## [577] "seagreen3" "seagreen4" "seashell"
## [580] "seashell1" "seashell2" "seashell3"
## [583] "seashell4" "sienna" "sienna1"
## [586] "sienna2" "sienna3" "sienna4"
## [589] "skyblue" "skyblue1" "skyblue2"
## [592] "skyblue3" "skyblue4" "slateblue"
## [595] "slateblue1" "slateblue2" "slateblue3"
## [598] "slateblue4" "slategray" "slategray1"
## [601] "slategray2" "slategray3" "slategray4"
## [604] "slategrey" "snow" "snow1"
## [607] "snow2" "snow3" "snow4"
## [610] "springgreen" "springgreen1" "springgreen2"
## [613] "springgreen3" "springgreen4" "steelblue"
## [616] "steelblue1" "steelblue2" "steelblue3"
## [619] "steelblue4" "tan" "tan1"
## [622] "tan2" "tan3" "tan4"
## [625] "thistle" "thistle1" "thistle2"
## [628] "thistle3" "thistle4" "tomato"
## [631] "tomato1" "tomato2" "tomato3"
## [634] "tomato4" "turquoise" "turquoise1"
## [637] "turquoise2" "turquoise3" "turquoise4"
## [640] "violet" "violetred" "violetred1"
## [643] "violetred2" "violetred3" "violetred4"
## [646] "wheat" "wheat1" "wheat2"
## [649] "wheat3" "wheat4" "whitesmoke"
## [652] "yellow" "yellow1" "yellow2"
## [655] "yellow3" "yellow4" "yellowgreen"
# Print the wordcloud with the specified colors
wordcloud(chardonnay_freqs$term,
chardonnay_freqs$num,
max.words = 100,
colors = c("grey80", "darkgoldenrod1", "tomato")
)
# List the available colors
display.brewer.all()
# Create purple_orange
purple_orange <- brewer.pal(10, "PuOr")
# Drop 2 faintest colors
purple_orange <- purple_orange[-(1:2)]
# Create a wordcloud with purple_orange palette
wordcloud(chardonnay_freqs$term, chardonnay_freqs$num, max.words = 100, colors = purple_orange)
# Create all_coffee
all_coffee <- paste(coffee_tweets, collapse=" ")
# Create all_chardonnay
all_chardonnay <- paste(chardonnay_tweets, collapse=" ")
# Create all_tweets
all_tweets <- c(all_coffee, all_chardonnay)
# Convert to a vector source
all_tweets <- VectorSource(all_tweets)
# Create all_corpus
all_corpus <- VCorpus(all_tweets)
clean_corpus <- function(corpus){
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, stripWhitespace)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removeWords, c(stopwords("en"), "amp", "glass", "chardonnay", "coffee"))
return(corpus)
}
# Clean the corpus
all_clean <- clean_corpus(all_corpus)
# Create all_tdm
all_tdm <- TermDocumentMatrix(all_clean)
# Create all_m
all_tdm_m <- as.matrix(all_tdm)
# Print a commonality cloud
commonality.cloud(all_tdm_m, colors="steelblue1", max.words=100)
# Clean the corpus
all_clean <- clean_corpus(all_corpus)
# Create all_tdm
all_tdm <- TermDocumentMatrix(all_clean)
# Give the columns distinct names
colnames(all_tdm) <- c("coffee", "chardonnay")
# Create all_m
all_tdm_m <- as.matrix(all_tdm)
# Create comparison cloud
comparison.cloud(all_tdm_m, colors=c("orange", "blue"), max.words=50)
# Create common_words
common_words <- subset(all_tdm_m, all_tdm_m[, 1] > 0 & all_tdm_m[, 2] > 0)
# Create difference
difference <- abs(common_words[, 1] - common_words[, 2])
# Combine common_words and difference
common_words <- cbind(common_words, difference)
# Order the data frame from most differences to least
common_words <- common_words[order(common_words[, 3], decreasing = TRUE), ]
# Create top25_df
top25_df <- data.frame(x = common_words[1:25, 1],
y = common_words[1:25, 2],
labels = rownames(common_words[1:25, ]))
# Create the pyramid plot
plotrix::pyramid.plot(top25_df$x, top25_df$y, labels=top25_df$labels,
gap=8, top.labels = c("Coffee", "Words", "Chardonnay"),
main = "Words in Common", laxlab = NULL,
raxlab = NULL, unit = NULL
)
## [1] 5.1 4.1 4.1 2.1
# Word association
word_associate(coffee_tweets, match.string = c("barista"),
stopwords = c(Top200Words, "coffee", "amp"),
network.plot = TRUE, cloud.colors = c("gray85", "darkred"))
## Warning in text2color(words = V(g)$label, recode.words = target.words,
## colors = label.colors): length of colors should be 1 more than length of
## recode.words
## row group unit text
## 1 544 all 544 RT @Barista_kyo: #coffee #latte #soylatte #thinkcoffee # # # # @ think coffee http://t.co/Hmy9RPRWTZ
## 2 569 all 569 RT @ReversoSmith: What a beautiful mess! #portafilter #coffee #espresso #coffeemachine #barista #baristalife? http://t.co/ZODcTfP22Z
## 3 658 all 658 The moment you realize your Starbucks barista gave you a regular iced Coffee when u asked 4 decaf. Shitty. Late night not planned.
## 4 931 all 931 Barista made my coffee wrong and still gave me both anyway #Starbucks #coffee #caffeine #upallnight http://t.co/iKCNwO8F6t
## 5 951 all 951 RT @FrankIero: hahaha @jamiasan :*gives Barista our Starbucks order* Barista: coffee? @jamiasan : yes, isn't this is a coffee store?
##
## Match Terms
## ===========
##
## List 1:
## baristakyo, barista, baristalife
##
# Add title
title(main = "Barista Coffee Tweet Associations")
Chapter 3 - Additional text mining (library tm) skills
Simple word clustering - hierarchical clustering and dendrograms (trees):
Getting past single words - considering “not” followed by “good” to have a very specific meaning, rather than just being a sentence containing “not” and “good”:
Different frequency criteria - frequent words can mask insights:
Example code includes:
rain <- data.frame(city=c( 'Cleveland', 'Portland', 'Boston', 'New Orleans' ),
rainfall=c( 39.14, 39.14, 43.77, 62.45 ),
stringsAsFactors=FALSE
)
str(rain)
## 'data.frame': 4 obs. of 2 variables:
## $ city : chr "Cleveland" "Portland" "Boston" "New Orleans"
## $ rainfall: num 39.1 39.1 43.8 62.5
# Create dist_rain
dist_rain <- dist(rain$rainfall)
# View the distance matrix
dist_rain
## 1 2 3
## 2 0.00
## 3 4.63 4.63
## 4 23.31 23.31 18.68
# Create hc
hc <- hclust(dist_rain)
# Plot hc
plot(hc, labels=rain$city)
# NEED TO DOUBLE CHECK EXISTENCE OF tweets_tdm
# Print the dimensions of tweets_tdm
tweets_tdm <- coffee_tdm
dim(tweets_tdm)
## [1] 3075 1000
# Create tdm1
tdm1 <- removeSparseTerms(tweets_tdm, sparse=0.95)
# Create tdm2
tdm2 <- removeSparseTerms(tweets_tdm, sparse=0.975)
# Print tdm1
tdm1
## <<TermDocumentMatrix (terms: 6, documents: 1000)>>
## Non-/sparse entries: 418/5582
## Sparsity : 93%
## Maximal term length: 7
## Weighting : term frequency (tf)
# Print tdm2
tdm2
## <<TermDocumentMatrix (terms: 40, documents: 1000)>>
## Non-/sparse entries: 1646/38354
## Sparsity : 96%
## Maximal term length: 13
## Weighting : term frequency (tf)
# Create tweets_tdm2
tweets_tdm2 <- removeSparseTerms(tweets_tdm, sparse=0.975)
# Create tdm_m
tdm_m <- as.matrix(tweets_tdm2)
# Create tdm_df
tdm_df <- as.data.frame(tdm_m)
# Create tweets_dist
tweets_dist <- dist(tdm_df)
# Create hc
hc <- hclust(tweets_dist)
# Plot the dendrogram
plot(hc)
# Load dendextend
library(dendextend)
##
## ---------------------
## Welcome to dendextend version 1.5.2
## Type citation('dendextend') for how to cite the package.
##
## Type browseVignettes(package = 'dendextend') for the package vignette.
## The github page is: https://github.com/talgalili/dendextend/
##
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
## Or contact: <tal.galili@gmail.com>
##
## To suppress this message use: suppressPackageStartupMessages(library(dendextend))
## ---------------------
##
## Attaching package: 'dendextend'
## The following object is masked from 'package:qdap':
##
## %>%
## The following object is masked from 'package:stats':
##
## cutree
# Create hc
hc <- hclust(tweets_dist)
# Create hcd
hcd <- as.dendrogram(hc)
# Print the labels in hcd
labels(hcd)
## [1] "cup" "like" "shop" "looks"
## [5] "show" "hgtv" "renovation" "charlie"
## [9] "hosting" "working" "portland" "movethesticks"
## [13] "whitehurst" "just" "get" "good"
## [17] "morning" "want" "tea" "drinking"
## [21] "can" "starbucks" "think" "iced"
## [25] "half" "chemicals" "cancer" "tested"
## [29] "1000" "single" "need" "ice"
## [33] "much" "amp" "now" "right"
## [37] "love" "make" "dont" "drink"
# Change the branch color to red for "marvin" and "gaye"
hcd <- branches_attr_by_labels(hcd, c("starbucks", "cup"), color="red")
# Plot hcd
plot(hcd)
# Add cluster rectangles
rect.dendrogram(hcd, k=2, border="grey50")
# Create hc
hc <- hclust(tweets_dist)
# Create hcd
hcd <- as.dendrogram(hc)
# Print the labels in hcd
labels(hcd)
## [1] "cup" "like" "shop" "looks"
## [5] "show" "hgtv" "renovation" "charlie"
## [9] "hosting" "working" "portland" "movethesticks"
## [13] "whitehurst" "just" "get" "good"
## [17] "morning" "want" "tea" "drinking"
## [21] "can" "starbucks" "think" "iced"
## [25] "half" "chemicals" "cancer" "tested"
## [29] "1000" "single" "need" "ice"
## [33] "much" "amp" "now" "right"
## [37] "love" "make" "dont" "drink"
# Change the branch color to red for "marvin" and "gaye"
hcd <- branches_attr_by_labels(hcd, c("cup", "just"), color="red")
# Plot hcd
plot(hcd)
# Add cluster rectangles
rect.dendrogram(hcd, k=2, border="grey50")
# Create associations
associations <- findAssocs(tweets_tdm, "venti", 0.2)
# View the venti associations
associations
## $venti
## breve drizzle entire pumps extra cuz forget
## 0.58 0.58 0.58 0.58 0.47 0.41 0.41
## okay hyper mocha vanilla wtf always asleep
## 0.41 0.33 0.33 0.33 0.29 0.26 0.26
## get starbucks white
## 0.25 0.25 0.23
# Create associations_df
associations_df <- list_vect2df(associations)[, 2:3]
# Plot the associations_df values (don't change this)
ggplot(associations_df, aes(y = associations_df[, 1])) +
geom_point(aes(x = associations_df[, 2]),
data = associations_df, size = 3) +
theme_gdocs()
# DOES NOT WORK ON MY MACHINE
# Make tokenizer function
tokenizer <- function(x)
RWeka::NGramTokenizer(x, RWeka::Weka_control(min = 2, max = 2))
text_corp <- clean_chardonnay
# Create unigram_dtm
unigram_dtm <- DocumentTermMatrix(text_corp)
# Create bigram_dtm
bigram_dtm <- DocumentTermMatrix(text_corp, control=list(tokenize=tokenizer))
# Examine unigram_dtm
unigram_dtm
## <<DocumentTermMatrix (documents: 1000, terms: 2979)>>
## Non-/sparse entries: 6986/2972014
## Sparsity : 100%
## Maximal term length: 27
## Weighting : term frequency (tf)
# Examine bigram_dtm
bigram_dtm
## <<DocumentTermMatrix (documents: 1000, terms: 4812)>>
## Non-/sparse entries: 6680/4805320
## Sparsity : 100%
## Maximal term length: 41
## Weighting : term frequency (tf)
# Create bigram_dtm_m
bigram_dtm_m <- as.matrix(bigram_dtm)
# Create freq
freq <- colSums(bigram_dtm_m)
# Create bi_words
bi_words <- names(freq)
# Examine part of bi_words
bi_words[2577:2587]
## [1] "mean liyah" "meaningless round"
## [3] "means bottles" "measure hamilton"
## [5] "meat piss" "meditation httptcoyjsysbuby"
## [7] "medium finish" "meds rare"
## [9] "meet anybody" "meet lot"
## [11] "meet three"
# Plot a wordcloud
wordcloud(bi_words, freq, max.words=15)
# Create tf_tdm
tf_tdm <- TermDocumentMatrix(text_corp)
# Create tfidf_tdm
tfidf_tdm <- TermDocumentMatrix(text_corp, control=list(weighting = weightTfIdf))
## Warning in weighting(x): empty document(s): 303 480 743
# Create tf_tdm_m
tf_tdm_m <- as.matrix(tf_tdm)
# Create tfidf_tdm_m
tfidf_tdm_m <- as.matrix(tfidf_tdm)
# Examine part of tf_tdm_m
tf_tdm_m[508:509, 5:10]
## Docs
## Terms 5 6 7 8 9 10
## corner 0 0 0 0 0 0
## corriander 0 0 0 0 0 0
# Examine part of tfidf_tdm_m
tf_tdm_m[508:509, 5:10]
## Docs
## Terms 5 6 7 8 9 10
## corner 0 0 0 0 0 0
## corriander 0 0 0 0 0 0
# DO NOT HAVE dataframe tweets
# Add author to custom reading list
custom_reader <- readTabular(mapping = list(content = "text",
id = "num",
author = "screenName",
date = "created"
))
# Make corpus with custom reading
# text_corpus <- VCorpus(DataframeSource(tweets), readerControl = list(reader = custom_reader))
# Clean corpus
# text_corpus <- clean_corpus(text_corpus)
# Print data
# text_corpus[[1]][1]
# Print metadata
# text_corpus[[1]][2]
Chapter 4 - Case study
Amazon vs Google case study - following the six key steps on an HR analytics project:
Step 3: Text Organization - creating an integrated qdapClean function:
Steps 4&5: Feature Extraction and Analysis - for example, sentiment scoring or bi-gram TDM:
Step 6: Reach a conclusion - end of the work flow:
Example code includes:
# Re-creating the data sets available in the case study
test <- read.csv("AmazonGoogleHRData_v001.csv",
stringsAsFactors=FALSE,
na.strings=c("NA", "NA ")
)
amzn <- subset(test, src=="amzn")
goog <- subset(test, src=="goog")
amzn$src <- NULL
goog$src <- NULL
# Print the structure of amzn
str(amzn)
## 'data.frame': 500 obs. of 4 variables:
## $ pg_num: int 50 50 50 50 50 50 50 50 50 50 ...
## $ url : chr "https://www.glassdoor.com/Reviews/Amazon-com-Reviews-E6036_P50.htm " "https://www.glassdoor.com/Reviews/Amazon-com-Reviews-E6036_P50.htm " "https://www.glassdoor.com/Reviews/Amazon-com-Reviews-E6036_P50.htm " "https://www.glassdoor.com/Reviews/Amazon-com-Reviews-E6036_P50.htm " ...
## $ pros : chr "You're surrounded by smart people and the projects are interesting, if a little daunting. " "Brand name is great. Have yet to meet somebody who is unfamiliar with Amazon. Hours weren't as bad as I had previously heard. B"| __truncated__ "Good money.Interaction with some great minds in the world during internal conferences and sessions.Of course the pride of being"| __truncated__ "nice pay and overtime and different shifts " ...
## $ cons : chr "Internal tools proliferation has created a mess for trying to get to basic information. Most people are required to learn/under"| __truncated__ "not the most stimulating work. Good brand name to work for but the work itself is mundane as it can get. As a financial analyst"| __truncated__ "No proper growth plan for employees.Difficult promotion process requiring a lot more documentation than your actual deliverable"| __truncated__ "didn't last quite long enough " ...
# Create amzn_pros
amzn_pros <- amzn$pros
# Create amzn_cons
amzn_cons <- amzn$cons
# Print the structure of goog
str(goog)
## 'data.frame': 500 obs. of 4 variables:
## $ pg_num: int 1 1 1 1 1 1 1 1 1 1 ...
## $ url : chr "https://www.glassdoor.com/Reviews/Google-Reviews-E9079_P1.htm " "https://www.glassdoor.com/Reviews/Google-Reviews-E9079_P1.htm " "https://www.glassdoor.com/Reviews/Google-Reviews-E9079_P1.htm " "https://www.glassdoor.com/Reviews/Google-Reviews-E9079_P1.htm " ...
## $ pros : chr "* If you're a software engineer, you're among the kings of the hill at Google. It's an engineer-driven company without a doubt "| __truncated__ "1) Food, food, food. 15+ cafes on main campus (MTV) alone. Mini-kitchens, snacks, drinks, free breakfast/lunch/dinner, all day,"| __truncated__ "You can't find a more well-regarded company that actually deserves the hype it gets. " "- you drive yourself here. If you want to grow, you have to seek out opportunities and prove that your worth. This keeps you mo"| __truncated__ ...
## $ cons : chr "* It *is* becoming larger, and with it comes growing pains: bureaucracy, slow to respond to market threats, bloated teams, cros"| __truncated__ "1) Work/life balance. What balance? All those perks and benefits are an illusion. They keep you at work and they help you to be"| __truncated__ "I live in SF so the commute can take between 1.5 hours to 1.75 hours each way on the shuttle - sometimes 2 hours each way on a "| __truncated__ "- Google is a big company. So there are going to be winners and losers when it comes to career growth. Due to the high hiring b"| __truncated__ ...
# Create goog_pros
goog_pros <- goog$pros
# Create goog_cons
goog_cons <- goog$cons
qdap_clean <- function(x){
x <- replace_abbreviation(x)
x <- replace_contraction(x)
x <- replace_number(x)
x <- replace_ordinal(x)
x <- replace_ordinal(x)
x <- replace_symbol(x)
x <- tolower(x)
return(x)
}
tm_clean <- function(corpus){
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, stripWhitespace)
corpus <- tm_map(corpus, removeWords,
c(stopwords("en"), "Google", "Amazon", "company"))
return(corpus)
}
# Alter amzn_pros
amzn_pros <- qdap_clean(amzn_pros)
# Alter amzn_cons
amzn_cons <- qdap_clean(amzn_cons)
# Create az_p_corp
az_p_corp <- VCorpus(VectorSource(amzn_pros[complete.cases(amzn_pros)]))
# Create az_c_corp
az_c_corp <- VCorpus(VectorSource(amzn_cons[complete.cases(amzn_cons)]))
# Create amzn_pros_corp
amzn_pros_corp <- tm_clean(az_p_corp)
# Create amzn_cons_corp
amzn_cons_corp <- tm_clean(az_c_corp)
# Apply qdap_clean to goog_pros
goog_pros <- qdap_clean(goog_pros)
# Apply qdap_clean to goog_cons
goog_cons <- qdap_clean(goog_cons)
# Create goog_p_corp
# complete.cases() to avoid the NA problem in RWeka::NGramTokenizer
goog_p_corp <- VCorpus(VectorSource(goog_pros[complete.cases(goog_pros)]))
# Create goog_c_corp
# complete.cases() to avoid the NA problem in RWeka::NGramTokenizer
goog_c_corp <- VCorpus(VectorSource(goog_cons[complete.cases(goog_cons)]))
# Create goog_pros_corp
goog_pros_corp <- tm_clean(goog_p_corp)
# Create goog_cons_corp
goog_cons_corp <- tm_clean(goog_c_corp)
# DOES NOT WORK ON MY MACHINE (needed the complete.cases() fix above - seems to struggle with NA data)
tokenizer <- function(x) { RWeka::NGramTokenizer(x, RWeka::Weka_control(min=2, max=2)) }
# Create amzn_p_tdm
amzn_p_tdm <- TermDocumentMatrix(amzn_pros_corp, control=list(tokenize=tokenizer))
# Create amzn_p_tdm_m
amzn_p_tdm_m <- as.matrix(amzn_p_tdm)
# Create amzn_p_freq
amzn_p_freq <- rowSums(amzn_p_tdm_m)
# Plot a wordcloud using amzn_p_freq values
wordcloud(names(amzn_p_freq), amzn_p_freq, max.words=25, color="blue")
## Warning in wordcloud(names(amzn_p_freq), amzn_p_freq, max.words = 25, color
## = "blue"): good benefits could not be fit on page. It will not be plotted.
## Warning in wordcloud(names(amzn_p_freq), amzn_p_freq, max.words = 25, color
## = "blue"): fast paced could not be fit on page. It will not be plotted.
## Warning in wordcloud(names(amzn_p_freq), amzn_p_freq, max.words = 25, color
## = "blue"): smart people could not be fit on page. It will not be plotted.
# Create amzn_c_tdm
amzn_c_tdm <- TermDocumentMatrix(amzn_cons_corp, control=list(tokenize=tokenizer))
# Create amzn_c_tdm_m
amzn_c_tdm_m <- as.matrix(amzn_c_tdm)
# Create amzn_c_freq
amzn_c_freq <- rowSums(amzn_c_tdm_m)
# Plot a wordcloud of negative Amazon bigrams
wordcloud(names(amzn_c_freq), amzn_c_freq, max.words=25, color="red")
# Create amzn_c_tdm
amzn_c_tdm <- TermDocumentMatrix(amzn_cons_corp, control=list(tokenize=tokenizer))
# Print amzn_c_tdm to the console
amzn_c_tdm
## <<TermDocumentMatrix (terms: 4777, documents: 494)>>
## Non-/sparse entries: 5217/2354621
## Sparsity : 100%
## Maximal term length: 31
## Weighting : term frequency (tf)
# Create amzn_c_tdm2 by removing sparse terms
amzn_c_tdm2 <- removeSparseTerms(amzn_c_tdm, sparse=0.993)
# Create hc as a cluster of distance values
hc <- hclust(dist(amzn_c_tdm2, method="euclidean"), method="complete")
# Produce a plot of hc
plot(hc)
# Create amzn_p_tdm
amzn_p_tdm <- TermDocumentMatrix(amzn_pros_corp, control=list(tokenize=tokenizer))
# Create amzn_p_m
amzn_p_m <- as.matrix(amzn_p_tdm)
# Create amzn_p_freq
amzn_p_freq <- rowSums(amzn_p_m)
# Create term_frequency
term_frequency <- sort(amzn_p_freq, decreasing=TRUE)
# Print the 5 most common terms
term_frequency[1:5]
## good pay great benefits smart people place work fast paced
## 25 24 20 17 16
# Find associations with fast paced
findAssocs(amzn_p_tdm, "fast paced", 0.2)
## $`fast paced`
## paced environment environments ever learn fast
## 0.49 0.35 0.35
## paced friendly paced work able excel
## 0.35 0.35 0.25
## activity ample advance one also well
## 0.25 0.25 0.25
## amazon fast amazon noting amazon one
## 0.25 0.25 0.25
## amount time ample opportunity assistance ninety
## 0.25 0.25 0.25
## benefits including break computer call activity
## 0.25 0.25 0.25
## can choose catchy cheers center things
## 0.25 0.25 0.25
## challenging expect cheers opportunity choose success
## 0.25 0.25 0.25
## combined encouragement competitive environments computer room
## 0.25 0.25 0.25
## cool things deliver results dock makes
## 0.25 0.25 0.25
## driven deliver easy learn emphasis shipping
## 0.25 0.25 0.25
## encouragement innovation environment benefits environment catchy
## 0.25 0.25 0.25
## environment center environment fast environment help
## 0.25 0.25 0.25
## environment smart ever known ever witnessed
## 0.25 0.25 0.25
## everchanging fast everyones preferences excel advance
## 0.25 0.25 0.25
## excel everchanging exciting environment expect learn
## 0.25 0.25 0.25
## extremely fast facility top fail successful
## 0.25 0.25 0.25
## fantastic able fired part five percent
## 0.25 0.25 0.25
## freindly place friendly atmosphere friendly management
## 0.25 0.25 0.25
## full medical get fired go extremely
## 0.25 0.25 0.25
## great plenty great teamwork happening technology
## 0.25 0.25 0.25
## hassle benefits help get help workers
## 0.25 0.25 0.25
## high quality high volume including full
## 0.25 0.25 0.25
## innovation owning job requirements leader can
## 0.25 0.25 0.25
## line break lot responsibility maintaining high
## 0.25 0.25 0.25
## makes time management nice nice facility
## 0.25 0.25 0.25
## ninety five noting short offers opportunity
## 0.25 0.25 0.25
## one competitive one fast opportunity overtime
## 0.25 0.25 0.25
## opportunity yell ownership fast owning work
## 0.25 0.25 0.25
## paced emphasis paced exciting paced high
## 0.25 0.25 0.25
## paced never paced rewarding paced ship
## 0.25 0.25 0.25
## paced software paid upfront people focused
## 0.25 0.25 0.25
## percent paid plenty shifts position fast
## 0.25 0.25 0.25
## possible still preferences fast products quickly
## 0.25 0.25 0.25
## quality bar quickly possible readily available
## 0.25 0.25 0.25
## requirements easy responsibility ownership results great
## 0.25 0.25 0.25
## results team rewarding people shifts everyones
## 0.25 0.25 0.25
## ship dock shipping products short amount
## 0.25 0.25 0.25
## short fantastic smart coworkers still maintaining
## 0.25 0.25 0.25
## success fail successful also team driven
## 0.25 0.25 0.25
## technology today things happening things lot
## 0.25 0.25 0.25
## time fast time go top line
## 0.25 0.25 0.25
## upfront experience vision well volume call
## 0.25 0.25 0.25
## well rewarded well tuition witnessed combined
## 0.25 0.25 0.25
## work can work cool work environments
## 0.25 0.25 0.25
## work fast work job workers readily
## 0.25 0.25 0.25
## yell leader
## 0.25
# DO NOT HAVE FILE all_goog_corp
# Created below
goog_df <- data.frame(pros=goog_pros, cons=goog_cons)
goog_df <- goog_df[complete.cases(goog_df), ]
str(goog_df)
## 'data.frame': 499 obs. of 2 variables:
## $ pros: Factor w/ 491 levels "- access to a vast wealth of technical resources and people",..: 20 354 485 12 409 227 412 375 308 383 ...
## $ cons: Factor w/ 489 levels "- bureaucracy, politics, legal issues, and privacy handling take up more and more time over the years and slow innovation and d"| __truncated__,..: 17 308 170 6 289 56 451 445 180 107 ...
goog_vec <- c(paste(goog_df$pros, collapse=" "),
paste(goog_df$cons, collapse=" ")
)
all_goog_corpus <- VCorpus(VectorSource(goog_vec))
# Create all_goog_corp
all_goog_corp <- tm_clean(all_goog_corpus)
# Create all_tdm
all_tdm <- TermDocumentMatrix(all_goog_corp)
# Name the columns of all_tdm
colnames(all_tdm) <- c("Goog_Pros", "Goog_Cons")
# Create all_m
all_m <- as.matrix(all_tdm)
# Build a comparison cloud
comparison.cloud(all_m, max.words=100, colors=c("#F44336", "#2196f3"))
# DO NOT HAVE - THIS IS THE ALL POSITIVE ASSOCIATIONS
# Created below
goog_p_tdm <- TermDocumentMatrix(goog_pros_corp, control=list(tokenize=tokenizer))
goog_p_tdm_m <- as.matrix(goog_p_tdm)
goog_p_freq <- rowSums(goog_p_tdm_m)
all_tdm_df <- merge(y=data.frame(keyWord=names(goog_p_freq), googNum=goog_p_freq, stringsAsFactors=FALSE),
x=data.frame(keyWord=names(amzn_p_freq), amznNum=amzn_p_freq, stringsAsFactors=FALSE),
by="keyWord", all=TRUE
)
all_tdm_df[is.na(all_tdm_df)] <- 0
all_tdm_m <- as.matrix(all_tdm_df[, -1])
rownames(all_tdm_m) <- all_tdm_df$keyWord
# Create common_words
common_words <- subset(all_tdm_m, all_tdm_m[, 1] > 0 & all_tdm_m[, 2] > 0)
# Create difference
difference <- abs(common_words[, 2] - common_words[, 1])
# Add difference to common_words
common_words <- cbind(common_words, difference)
# Order the data frame from most differences to least
common_words <- common_words[order(common_words[, 3], decreasing=TRUE), ]
# Create top15_df
top15_df <- data.frame(x=common_words[1:15, 1], y=common_words[1:15, 2], labels=rownames(common_words)[1:15])
# Create the pyramid plot
plotrix::pyramid.plot(top15_df$x, top15_df$y,
labels=top15_df$labels, gap = 12,
top.labels = c("Amzn", "Pro Words", "Google"),
main = "Words in Common", unit = NULL
)
## [1] 5.1 4.1 4.1 2.1
# DO NOT HAVE - THIS IS THE ALL NEGATIVE ASSOCIATIONS
# Created below
goog_c_tdm <- TermDocumentMatrix(goog_cons_corp, control=list(tokenize=tokenizer))
goog_c_tdm_m <- as.matrix(goog_c_tdm)
goog_c_freq <- rowSums(goog_c_tdm_m)
all_tdm_df <- merge(y=data.frame(keyWord=names(goog_c_freq), googNum=goog_c_freq, stringsAsFactors=FALSE),
x=data.frame(keyWord=names(amzn_c_freq), amznNum=amzn_c_freq, stringsAsFactors=FALSE),
by="keyWord", all=TRUE
)
all_tdm_df[is.na(all_tdm_df)] <- 0
all_tdm_m <- as.matrix(all_tdm_df[, -1])
rownames(all_tdm_m) <- all_tdm_df$keyWord
# Create common_words
common_words <- subset(all_tdm_m, all_tdm_m[, 1] > 0 & all_tdm_m[, 2] > 0)
# Create difference
difference <- abs(common_words[, 2] - common_words[, 1])
# Bind difference to common_words
common_words <- cbind(common_words, difference)
# Order the data frame from most differences to least
common_words <- common_words[order(common_words[, 3], decreasing=TRUE), ]
# Create top15_df
top15_df <- data.frame(x=common_words[1:15, 1], y=common_words[1:15, 2], labels=rownames(common_words)[1:15])
# Create the pyramid plot
plotrix::pyramid.plot(top15_df$x, top15_df$y,
labels=top15_df$labels, gap = 12,
top.labels = c("Amzn", "Cons Words", "Google"),
main = "Words in Common", unit = NULL
)
## [1] 4 2 4 2
R is a programming language, while RStudio is a company that created an IDE for R:
Install R and Rstudio - both are free downloads:
Rstudio panes - Console, Environment, File/Plot/Package/Help:
Source pane - good place to write multi-line code prior to running it in the console:
The View() function is the data viewer - just run it at the console, with the name of a frame inside:
Environment pane is in the upper-right corner, keeping track of the R session:
History tab is next to the environment tab, as part of the Environment pane:
Files pane is in the bottom-right corner of the default Rstudio layout:
Plots pane and packages tab - the lower-right series of panes:
Help pane displays the help pages for R objects:
Viewer tab is also in the lower-right pane, and shows html output (if any) produced during the session:
Coding features - when writing in the Source pane for a .R object in Rstudio, R-specific coding and extensions are applied:
Coding diagnostics - Rstudio flags potential errors in the code prior to the code running:
Keyboard shortcuts help save time while writing code in the Source pane:
Multiple cursors can be created within the editor using CTRL-ALT-
Navigate and edit code using SHIFT-ALT-G to jump to any line in the document:
Run scripts frequently to help with checking and debugging code:
Traceback helps to debug errors that occur when you run the code:
Debugger mode is a way of pausing time - run one line, then see how Rstudio sees the code and environment and variables at that specific moment:
Debugger mode: breakpoints can help find what is going wrong, even if the code is not throwing a formal error (wrong result, rather than code bombs out):
Rstudio project for navigating between projects:
Populating projects - assuming starting with File/New Project/New Directory/Empty Project:
Packrat allows for using different versions of a package for different projects (useful for reproducible research):
Introduction to R packages - best way to share functions, vignettes, and the like:
Import and load source files - can add files from the “Add” tab of the File/New Project/New Directory/R Package process:
Package documentation (Part I) - R documentation files have a special format and are saved as .Rd files:
Package documentation (Part II) - filling in the skeleton that roxygen has created:
Package documentation (Part III) - Build/More/Document:
Test packages (Part I) - make sure that all of the functions work, including cross-function dependencies:
Test packages (Part II) - create tests by saving a new script to the test/testthat directory:
Test packages (Part III) - use Build/Test Package from Environment/History tab (CTRL-SHIFT-T works also):
Check packages is an optional component of the package building process:
Build packages - R converts to a single compressed file (.tar.gz) which is known as a “tarball”:
Chapter 2 - Version Control
Introduction to Git (available in Rstudio, along with SVN, to help with collaboration and version control):
Stage and commit - using the Git tab in the Environment/History pane:
Using .gitignore - telling Git that certain files should not be flagged as having differences from the “official version”:
Git Icons - example:
Commit history - accessed through the “Commit” tab of the “History” window:
Undo committed changes: checkout (Git equivalent to the “Undo” button in some other softwares):
Undo committed changes - returning only to the previously committed file does not require a “checkout”:
Introduction to GitHub - the github.com website allows for keeping copies in the cloud, even as collaborator work off-line:
Pull and push - additional layer of complexity that github.com adds to Git:
Chapter 3 - Reporting
Tools for reporting - sharing results to a wider audience (clients, collaborators, etc.):
Introduction to R Markdown - creating all of the code for a reproducible research, plus all of the supporting text:
R Markdown in Rstudio - integration by way of the .Rmd in the source pane:
Rendering R Markdown - available through a GUI system in Rstudio:
Compile notebook - can convert any R Script document to R Markdown using File/Compile Notebook:
Rstudio LaTeX editor - common format used in match and science departments for reporting:
Shiny applications can easily be written, tested, and run using Rstudio:
Publish Shiny apps - place finalized apps on-line:
Robert Muenchen - author of “R for SAS and SPSS Users” and “R for Stata Users”:
Example code includes:
utils::demo("graphics") # nice example of plots and data
##
##
## demo(graphics)
## ---- ~~~~~~~~
##
## > # Copyright (C) 1997-2009 The R Core Team
## >
## > require(datasets)
##
## > require(grDevices); require(graphics)
##
## > ## Here is some code which illustrates some of the differences between
## > ## R and S graphics capabilities. Note that colors are generally specified
## > ## by a character string name (taken from the X11 rgb.txt file) and that line
## > ## textures are given similarly. The parameter "bg" sets the background
## > ## parameter for the plot and there is also an "fg" parameter which sets
## > ## the foreground color.
## >
## >
## > x <- stats::rnorm(50)
##
## > opar <- par(bg = "white")
##
## > plot(x, ann = FALSE, type = "n")
##
## > abline(h = 0, col = gray(.90))
##
## > lines(x, col = "green4", lty = "dotted")
##
## > points(x, bg = "limegreen", pch = 21)
##
## > title(main = "Simple Use of Color In a Plot",
## + xlab = "Just a Whisper of a Label",
## + col.main = "blue", col.lab = gray(.8),
## + cex.main = 1.2, cex.lab = 1.0, font.main = 4, font.lab = 3)
##
## > ## A little color wheel. This code just plots equally spaced hues in
## > ## a pie chart. If you have a cheap SVGA monitor (like me) you will
## > ## probably find that numerically equispaced does not mean visually
## > ## equispaced. On my display at home, these colors tend to cluster at
## > ## the RGB primaries. On the other hand on the SGI Indy at work the
## > ## effect is near perfect.
## >
## > par(bg = "gray")
##
## > pie(rep(1,24), col = rainbow(24), radius = 0.9)
##
## > title(main = "A Sample Color Wheel", cex.main = 1.4, font.main = 3)
##
## > title(xlab = "(Use this as a test of monitor linearity)",
## + cex.lab = 0.8, font.lab = 3)
##
## > ## We have already confessed to having these. This is just showing off X11
## > ## color names (and the example (from the postscript manual) is pretty "cute".
## >
## > pie.sales <- c(0.12, 0.3, 0.26, 0.16, 0.04, 0.12)
##
## > names(pie.sales) <- c("Blueberry", "Cherry",
## + "Apple", "Boston Cream", "Other", "Vanilla Cream")
##
## > pie(pie.sales,
## + col = c("purple","violetred1","green3","cornsilk","cyan","white"))
##
## > title(main = "January Pie Sales", cex.main = 1.8, font.main = 1)
##
## > title(xlab = "(Don't try this at home kids)", cex.lab = 0.8, font.lab = 3)
##
## > ## Boxplots: I couldn't resist the capability for filling the "box".
## > ## The use of color seems like a useful addition, it focuses attention
## > ## on the central bulk of the data.
## >
## > par(bg="cornsilk")
##
## > n <- 10
##
## > g <- gl(n, 100, n*100)
##
## > x <- rnorm(n*100) + sqrt(as.numeric(g))
##
## > boxplot(split(x,g), col="lavender", notch=TRUE)
##
## > title(main="Notched Boxplots", xlab="Group", font.main=4, font.lab=1)
##
## > ## An example showing how to fill between curves.
## >
## > par(bg="white")
##
## > n <- 100
##
## > x <- c(0,cumsum(rnorm(n)))
##
## > y <- c(0,cumsum(rnorm(n)))
##
## > xx <- c(0:n, n:0)
##
## > yy <- c(x, rev(y))
##
## > plot(xx, yy, type="n", xlab="Time", ylab="Distance")
##
## > polygon(xx, yy, col="gray")
##
## > title("Distance Between Brownian Motions")
##
## > ## Colored plot margins, axis labels and titles. You do need to be
## > ## careful with these kinds of effects. It's easy to go completely
## > ## over the top and you can end up with your lunch all over the keyboard.
## > ## On the other hand, my market research clients love it.
## >
## > x <- c(0.00, 0.40, 0.86, 0.85, 0.69, 0.48, 0.54, 1.09, 1.11, 1.73, 2.05, 2.02)
##
## > par(bg="lightgray")
##
## > plot(x, type="n", axes=FALSE, ann=FALSE)
##
## > usr <- par("usr")
##
## > rect(usr[1], usr[3], usr[2], usr[4], col="cornsilk", border="black")
##
## > lines(x, col="blue")
##
## > points(x, pch=21, bg="lightcyan", cex=1.25)
##
## > axis(2, col.axis="blue", las=1)
##
## > axis(1, at=1:12, lab=month.abb, col.axis="blue")
##
## > box()
##
## > title(main= "The Level of Interest in R", font.main=4, col.main="red")
##
## > title(xlab= "1996", col.lab="red")
##
## > ## A filled histogram, showing how to change the font used for the
## > ## main title without changing the other annotation.
## >
## > par(bg="cornsilk")
##
## > x <- rnorm(1000)
##
## > hist(x, xlim=range(-4, 4, x), col="lavender", main="")
##
## > title(main="1000 Normal Random Variates", font.main=3)
##
## > ## A scatterplot matrix
## > ## The good old Iris data (yet again)
## >
## > pairs(iris[1:4], main="Edgar Anderson's Iris Data", font.main=4, pch=19)
##
## > pairs(iris[1:4], main="Edgar Anderson's Iris Data", pch=21,
## + bg = c("red", "green3", "blue")[unclass(iris$Species)])
##
## > ## Contour plotting
## > ## This produces a topographic map of one of Auckland's many volcanic "peaks".
## >
## > x <- 10*1:nrow(volcano)
##
## > y <- 10*1:ncol(volcano)
##
## > lev <- pretty(range(volcano), 10)
##
## > par(bg = "lightcyan")
##
## > pin <- par("pin")
##
## > xdelta <- diff(range(x))
##
## > ydelta <- diff(range(y))
##
## > xscale <- pin[1]/xdelta
##
## > yscale <- pin[2]/ydelta
##
## > scale <- min(xscale, yscale)
##
## > xadd <- 0.5*(pin[1]/scale - xdelta)
##
## > yadd <- 0.5*(pin[2]/scale - ydelta)
##
## > plot(numeric(0), numeric(0),
## + xlim = range(x)+c(-1,1)*xadd, ylim = range(y)+c(-1,1)*yadd,
## + type = "n", ann = FALSE)
##
## > usr <- par("usr")
##
## > rect(usr[1], usr[3], usr[2], usr[4], col="green3")
##
## > contour(x, y, volcano, levels = lev, col="yellow", lty="solid", add=TRUE)
##
## > box()
##
## > title("A Topographic Map of Maunga Whau", font= 4)
##
## > title(xlab = "Meters North", ylab = "Meters West", font= 3)
##
## > mtext("10 Meter Contour Spacing", side=3, line=0.35, outer=FALSE,
## + at = mean(par("usr")[1:2]), cex=0.7, font=3)
##
## > ## Conditioning plots
## >
## > par(bg="cornsilk")
##
## > coplot(lat ~ long | depth, data = quakes, pch = 21, bg = "green3")
##
## > par(opar)
Chapter 2 - Installing and Maintaining R
Installation typically includes both R (www.r-project.org) and R Studio (rstudio.com):
Chapter 3 - Help and Documentation
R Help can be accessed in several ways - help(myFunction) or ?myFunction or ??myFunction:
Chapter 4 - R Studio Basics
R Studio typically has four windows/consoles to work in:
Chapter 5 - Programming Language Basics
Programming Language Basics - R is an Object Oriented Language:
Parentheses and Braces:
Chapter 6 - Data Structures
Introduction to data structures - R has vectors, factors, data frames, arrays, lists, etc., and not just “the data set”:
Obtaining information from vectors:
Factors (categorical variables) and labels:
Data Frames are the closest equivalent to the dataset in other languages:
Matrices and lists:
Example code includes:
# The gender vector
gender <- c("f", "f", "f", NA, "m", "m", "m", "m")
# Create a factor with the labels "Female" and "Male" and print the result
gender <- factor(gender, levels=c("f", "m"), labels=c("Female", "Male"))
gender
## [1] Female Female Female <NA> Male Male Male Male
## Levels: Female Male
# The q1 vector
q1 <- c(1, 2, 2, 3, 4, 5, 5, 4)
# Select the scores of the females
q1[ gender == "Female" ]
## [1] 1 2 2 NA
# Our data so far:
# The vector country
country <- c(1, 2, 1, 2, 1, 2, 1, 2)
# The period vector
period <- c("bc", "bc", "bc", NA, "ac", "ac", "ac", "ac")
# Business hours quarter 1, 2, 3 and 4
QR1 <- c(36, 34, 37, 35, 33, 32, 35, 31)
QR2 <- c(37, 35, 38, 36, 35, 33, 35, 33)
QR3 <- c(39, 37, 40, NA, 36, 35, 37, 35)
QR4 <- c(36, 34, 37, 35, 34, 32, 36, 32)
# Create a data frame of the data of so far and assign it to 'company_data'.
company_data <- data.frame(country, period, QR1, QR2, QR3, QR4, stringsAsFactors=FALSE)
# Print the data frame
company_data
## country period QR1 QR2 QR3 QR4
## 1 1 bc 36 37 39 36
## 2 2 bc 34 35 37 34
## 3 1 bc 37 38 40 37
## 4 2 <NA> 35 36 NA 35
## 5 1 ac 33 35 36 34
## 6 2 ac 32 33 35 32
## 7 1 ac 35 35 37 36
## 8 2 ac 31 33 35 32
mymatrix <- matrix( c(36, 34, 37, 35, 33, 32, 35, 31, 37, 35, 38, 36, 35, 33, 35, 33, 39, 37, 40, NA, 36, 35, 37, 35, 36, 34, 37, 35, 34, 32, 36, 32), nrow=8, ncol=4)
# Construct the same matrix as mymatrix by using the vectors QR1, QR2, QR3 and QR4 and assign to same_matrix.
same_matrix <- cbind(QR1, QR2, QR3, QR4)
# Compute the correlation between the columns of same_matrix by using pairwise deletion of missing values
cor(same_matrix, use="pairwise")
## QR1 QR2 QR3 QR4
## QR1 1.0000000 0.9531986 0.9669876 0.9686649
## QR2 0.9531986 1.0000000 0.9803486 0.9244735
## QR3 0.9669876 0.9803486 1.0000000 0.9193967
## QR4 0.9686649 0.9244735 0.9193967 1.0000000
Chapter 7 - Managing Files and Workspace
Manipulating objects - most languages require operating system commands:
Managing workspace - workind directory is where files will be read/written by default:
Example code includes:
# List all objects that are stored in the workspace.
ls()
## [1] "a_data_frame" "a_factor"
## [3] "a_fancy_microwave" "a_high_end_microwave"
## [5] "a_linear_model" "a_microwave_oven"
## [7] "a_numeric_vector" "acc"
## [9] "acc_full" "acc_g"
## [11] "acc_i" "acc_small"
## [13] "accs" "ads"
## [15] "airquality" "all_chardonnay"
## [17] "all_clean" "all_coffee"
## [19] "all_cols" "all_corpus"
## [21] "all_goog_corp" "all_goog_corpus"
## [23] "all_m" "all_tdm"
## [25] "all_tdm_df" "all_tdm_m"
## [27] "all_tweets" "amzn"
## [29] "amzn_c_freq" "amzn_c_tdm"
## [31] "amzn_c_tdm_m" "amzn_c_tdm2"
## [33] "amzn_cons" "amzn_cons_corp"
## [35] "amzn_p_freq" "amzn_p_m"
## [37] "amzn_p_tdm" "amzn_p_tdm_m"
## [39] "amzn_pros" "amzn_pros_corp"
## [41] "another_microwave_oven" "ascii_pizza_slice"
## [43] "assigned_microwave_oven" "associations"
## [45] "associations_df" "atmos"
## [47] "az_c_corp" "az_p_corp"
## [49] "bbbDescr" "bi_words"
## [51] "bigram_dtm" "bigram_dtm_m"
## [53] "blackChess" "bloodbrain_x"
## [55] "bloodbrain_x_small" "bloodbrain_y"
## [57] "Boston" "BostonHousing"
## [59] "breast_cancer_x" "breast_cancer_y"
## [61] "cars" "cgdp"
## [63] "cgdp_afg" "chardonnay_corp"
## [65] "chardonnay_freqs" "chardonnay_m"
## [67] "chardonnay_source" "chardonnay_tdm"
## [69] "chardonnay_tweets" "chardonnay_words"
## [71] "chess" "choco_data"
## [73] "churn_x" "churn_y"
## [75] "churnTest" "churnTrain"
## [77] "clean_chardonnay" "clean_corp"
## [79] "clean_corpus" "cloned_microwave_oven"
## [81] "coffee_corpus" "coffee_dtm"
## [83] "coffee_m" "coffee_source"
## [85] "coffee_tdm" "coffee_tweets"
## [87] "common_words" "comp"
## [89] "comp_dict" "compA"
## [91] "company_data" "compB"
## [93] "complete_text" "complicate"
## [95] "conf" "conf_full"
## [97] "conf_g" "conf_i"
## [99] "conf_small" "country"
## [101] "crime_data" "crime_data_sc"
## [103] "crime_km" "crime_single"
## [105] "curCol" "curMeans"
## [107] "curRow" "curSD"
## [109] "custom_reader" "cylSplit"
## [111] "data.dist" "data.scaled"
## [113] "days" "desMeans"
## [115] "desSD" "df_corpus"
## [117] "df_source" "diagnosis"
## [119] "diamonds" "difference"
## [121] "dist_matrix" "dist_rain"
## [123] "do_math" "draw_roc_lines"
## [125] "dunn_complete" "dunn_km"
## [127] "dunn_km_sc" "dunn_single"
## [129] "e" "eachState"
## [131] "emails" "emails_full"
## [133] "emails_small" "env"
## [135] "env_microwave_oven_factory" "env2"
## [137] "error" "example_kelvin"
## [139] "example_text" "fancy_microwave_oven_factory"
## [141] "fancy_microwave_power_rating" "FN"
## [143] "foo" "FP"
## [145] "freq" "frequency"
## [147] "frequency2" "funDummy"
## [149] "future_days" "g"
## [151] "gender" "get_n_elements"
## [153] "get_n_elements.data.frame" "get_n_elements.default"
## [155] "goog" "goog_c_corp"
## [157] "goog_c_freq" "goog_c_tdm"
## [159] "goog_c_tdm_m" "goog_cons"
## [161] "goog_cons_corp" "goog_df"
## [163] "goog_p_corp" "goog_p_freq"
## [165] "goog_p_tdm" "goog_p_tdm_m"
## [167] "goog_pros" "goog_pros_corp"
## [169] "goog_vec" "hc"
## [171] "hcd" "hclust.average"
## [173] "hclust.complete" "hclust.out"
## [175] "hclust.pokemon" "hclust.single"
## [177] "high_end_microwave_oven_factory" "i"
## [179] "idxTrain" "indices"
## [181] "intCtr" "inv"
## [183] "iris" "k"
## [185] "kang_nose" "keyIdx"
## [187] "keyNames" "keyStateNames"
## [189] "keyStateNums" "kitty"
## [191] "km.out" "km_cars"
## [193] "km_seeds" "kmeans_iris"
## [195] "knn_test" "knn_train"
## [197] "last_5" "lastChristmasNoon"
## [199] "lev" "linkedin"
## [201] "linkedin_lm" "linkedin_pred"
## [203] "listA" "lm_choco"
## [205] "lm_kang" "lm_shop"
## [207] "lm_wage" "lm_wb"
## [209] "lm_wb_log" "logBBB"
## [211] "lst" "lst2"
## [213] "max_age" "max_class"
## [215] "me" "means"
## [217] "memb_complete" "memb_single"
## [219] "microwave_oven" "microwave_oven_factory"
## [221] "microwave_power_rating" "min_age"
## [223] "min_class" "mod"
## [225] "mod2" "mod3"
## [227] "model" "model_glmnet"
## [229] "model_list" "model_rf"
## [231] "model1" "model2"
## [233] "mpgRange" "mpgScale"
## [235] "mtcars" "mtxTest"
## [237] "my_class" "my_iris"
## [239] "my_knn" "myControl"
## [241] "myFolds" "mymatrix"
## [243] "myVal" "myWords"
## [245] "n" "n_elements_ability.cov"
## [247] "n_elements_sleep" "n_smart"
## [249] "new_stops" "new_text"
## [251] "nextUMHomeGame" "nms"
## [253] "nose_length_est" "nose_width_new"
## [255] "nRed" "nWhite"
## [257] "opar" "other_199"
## [259] "p" "p_class"
## [261] "period" "pie.sales"
## [263] "pin" "pokemon"
## [265] "pokemon.scaled" "pokeTotal"
## [267] "poss_log10" "pr.out"
## [269] "pr.var" "pr.with.scaling"
## [271] "pr.without.scaling" "prec"
## [273] "pred" "pred_full"
## [275] "pred_g" "pred_i"
## [277] "pretty_titles" "prevData"
## [279] "previous_4" "prop_less"
## [281] "pruned" "purple_orange"
## [283] "pve" "q1"
## [285] "qdap_clean" "QR1"
## [287] "QR2" "QR3"
## [289] "QR4" "r_sq"
## [291] "rain" "range"
## [293] "ranks" "ratio_ss"
## [295] "rawTweets" "rec"
## [297] "redWine" "remove_cols"
## [299] "res" "res_test"
## [301] "resamps" "rightNow"
## [303] "rmse" "rmse_test"
## [305] "rmse_train" "rows"
## [307] "run_complete" "run_dist"
## [309] "run_km" "run_km_sc"
## [311] "run_record" "run_record_sc"
## [313] "run_single" "safe_log10"
## [315] "sales" "same_matrix"
## [317] "sampMsg" "scale"
## [319] "school_km" "school_result"
## [321] "seeds" "seeds_km"
## [323] "seeds_km_1" "seeds_km_2"
## [325] "seeds_type" "shop_data"
## [327] "shop_new" "shuffled"
## [329] "size_dist" "some_vars"
## [331] "Sonar" "spam"
## [333] "spam_classifier" "spam_pred"
## [335] "species" "split"
## [337] "sq_ft" "ss_res"
## [339] "ss_tot" "stateCols"
## [341] "stateDF" "stem_doc"
## [343] "tdm_df" "tdm_m"
## [345] "tdm1" "tdm2"
## [347] "term_count" "term_frequency"
## [349] "test" "test_labels"
## [351] "test_output" "test_output_knn"
## [353] "test_output_lm" "test_output_lm_log"
## [355] "text" "text_corp"
## [357] "tf_tdm" "tf_tdm_m"
## [359] "tfidf_tdm" "tfidf_tdm_m"
## [361] "titanic" "titanic_train"
## [363] "tm_clean" "TN"
## [365] "tokenizer" "top_grades"
## [367] "top15_df" "top25_df"
## [369] "TP" "train"
## [371] "train_indices" "train_labels"
## [373] "tree" "tree_g"
## [375] "tree_i" "trIdx"
## [377] "tweets_dist" "tweets_tdm"
## [379] "tweets_tdm2" "type_info"
## [381] "unigram_dtm" "unseen"
## [383] "urb_pop" "url"
## [385] "usr" "v1"
## [387] "v2" "v3"
## [389] "v4" "v5"
## [391] "v6" "vec_corpus"
## [393] "vec_source" "Wage"
## [395] "what_am_i" "what_am_i.cat"
## [397] "what_am_i.character" "what_am_i.mammal"
## [399] "whiteChess" "whiteWine"
## [401] "wine" "wisc.data"
## [403] "wisc.df" "wisc.hclust"
## [405] "wisc.hclust.clusters" "wisc.km"
## [407] "wisc.pr" "wisc.pr.hclust"
## [409] "wisc.pr.hclust.clusters" "word_freqs"
## [411] "world_bank_test" "world_bank_test_input"
## [413] "world_bank_test_output" "world_bank_test_truth"
## [415] "world_bank_train" "worst_grades"
## [417] "wss" "x"
## [419] "xadd" "xdelta"
## [421] "xFactorNon" "xFactorOrder"
## [423] "xRaw" "xscale"
## [425] "xx" "y"
## [427] "yadd" "ydelta"
## [429] "year" "yscale"
## [431] "yy"
# List all objects in the workspace with a "q" in their name.
ls(pattern = "q")
## [1] "airquality" "amzn_c_freq" "amzn_p_freq"
## [4] "chardonnay_freqs" "freq" "frequency"
## [7] "frequency2" "goog_c_freq" "goog_p_freq"
## [10] "q1" "qdap_clean" "r_sq"
## [13] "sq_ft" "term_frequency" "word_freqs"
# The workshop and businesshours data frame are already loaded in your workspace
businesshours <- company_data
workshop <- data.frame(workshop=c(1, 2, 1, 2, 1, 2, 1, 2),
gender=c("f", "f", "f", NA, "m", "m", "m", "m"),
q1=c(1, 2, 2, 3, 4, 5, 5, 4), q2=c(1, 1, 2, 1, 5, 4, 3, 5),
q3=c(5, 4, 4, NA, 2, 5, 4, 5), q4=c(1, 1, 3, 3, 4, 5, 4, 5)
)
# Have a look at the first three rows of the `workshop` factor.
head(workshop, n=3)
## workshop gender q1 q2 q3 q4
## 1 1 f 1 1 5 1
## 2 2 f 2 1 4 1
## 3 1 f 2 2 4 3
# Have a look at the structure part of the `workshop` factor.
str(workshop)
## 'data.frame': 8 obs. of 6 variables:
## $ workshop: num 1 2 1 2 1 2 1 2
## $ gender : Factor w/ 2 levels "f","m": 1 1 1 NA 2 2 2 2
## $ q1 : num 1 2 2 3 4 5 5 4
## $ q2 : num 1 1 2 1 5 4 3 5
## $ q3 : num 5 4 4 NA 2 5 4 5
## $ q4 : num 1 1 3 3 4 5 4 5
# Have a look at the last four rows of the `businesshours` data frame.
tail(businesshours, n=4)
## country period QR1 QR2 QR3 QR4
## 5 1 ac 33 35 36 34
## 6 2 ac 32 33 35 32
## 7 1 ac 35 35 37 36
## 8 2 ac 31 33 35 32
# Have a look at the attributes of the `businesshours` data frame.
attributes(businesshours)
## $names
## [1] "country" "period" "QR1" "QR2" "QR3" "QR4"
##
## $row.names
## [1] 1 2 3 4 5 6 7 8
##
## $class
## [1] "data.frame"
# Assign the objects with the character q in their name to a variable 'objects_with_q'
objects_with_q <- ls(pattern = "q")
# Remove the objects (print them instead)
print(objects_with_q)
## [1] "airquality" "amzn_c_freq" "amzn_p_freq"
## [4] "chardonnay_freqs" "freq" "frequency"
## [7] "frequency2" "goog_c_freq" "goog_p_freq"
## [10] "q1" "qdap_clean" "r_sq"
## [13] "sq_ft" "term_frequency" "word_freqs"
# rm(list = objects_with_q)
Chapter 8 - Controlling Functions
Functions and Arguments - R is controlled by functions that are called with values passed to arguments:
Classes - generic functions offer different methods for each class of objects:
Example code includes:
QR1 <- c(36, 34, 37, 35, 33, 32, 35, 31)
QR2 <- c(37, 35, 38, 36, 35, 33, 35, 33)
QR3 <- c(39, 37, 40, NA, 36, 35, 37, 35)
# Correct the code below
summary(data.frame(QR1, QR2, QR3))
## QR1 QR2 QR3
## Min. :31.00 Min. :33.00 Min. :35.0
## 1st Qu.:32.75 1st Qu.:34.50 1st Qu.:35.5
## Median :34.50 Median :35.00 Median :37.0
## Mean :34.12 Mean :35.25 Mean :37.0
## 3rd Qu.:35.25 3rd Qu.:36.25 3rd Qu.:38.0
## Max. :37.00 Max. :38.00 Max. :40.0
## NA's :1
# Verify the class of QR1
class(QR1)
## [1] "numeric"
# Change the class of QR1 to character
QR1_char <- as.character(QR1)
# Verify the class
class(QR1_char)
## [1] "character"
QR4 <- c(36, 34, 37, 35, 34, 32, 36, 32)
# Code block 1
# meanQR1 <- mean(QR1)
# meanQR2 <- mean(QR2)
# meanQR3 <- mean(QR3)
# meanQR4 <- mean(QR4)
max(c(mean(QR1), mean(QR2), mean(QR3), mean(QR4)))
## [1] NA
# Code block 2
# maxQR1 <- max(QR1)
# maxQR2 <- max(QR2)
# maxQR3 <- max(QR3)
# maxQR4 <- max(QR4)
min(c(max(QR1), max(QR2), max(QR3), max(QR4)))
## [1] NA
# Code block 3
# sum_element_wise <- QR1 + QR2 + QR3 + QR4
# log_q <- log(sum_element_wise)
quantile(log(QR1 + QR2 + QR3 + QR4), na.rm=TRUE)
## 0% 25% 50% 75% 100%
## 4.875197 4.905028 4.941642 4.980028 5.023881
Chapter 9 - Data Acquisition
Loading CSV file - from local or from URL:
Loading other data - for example, tab-delimited files (TSV):
Example code includes:
# Load the workshop data and assign them to the variable below
mydata <-read.csv("http://bit.ly/bob_mydata_csv", strip.white=TRUE, na.strings="")
# Print mydata
mydata
## workshop gender q1 q2 q3 q4
## 1 1 f 1 1 5 1
## 2 2 f 2 1 4 1
## 3 1 f 2 2 4 3
## 4 2 <NA> 3 1 NA 3
## 5 1 m 4 5 2 4
## 6 2 m 5 4 5 5
## 7 1 m 5 3 4 4
## 8 2 m 4 5 5 5
# Load the library
library(sas7bdat)
# Load the workshop tab file with the right arguments and assign them to the variable 'mydata_tab'
mydata_tab <- read.delim("http://bit.ly/bob_mydata_tab", strip.white=TRUE, na.strings="")
# Load the workshop SAS file and assign them to the variable 'mydata_sas'
mydata_sas <- read.sas7bdat("http://bit.ly/bob_mydata_sas7bdat")
# Print both variables
mydata_tab
## workshop gender q1 q2 q3 q4
## 1 1 f 1 1 5 1
## 2 2 f 2 1 4 1
## 3 1 f 2 2 4 3
## 4 2 <NA> 3 1 NA 3
## 5 1 m 4 5 2 4
## 6 2 m 5 4 5 5
## 7 1 m 5 3 4 4
## 8 2 m 4 5 5 5
mydata_sas
## id workshop gender q1 q2 q3 q4
## 1 1 1 f 1 1 5 1
## 2 2 2 f 2 1 4 1
## 3 3 1 f 2 2 4 3
## 4 4 2 . 3 1 NaN 3
## 5 5 1 m 4 5 2 4
## 6 6 2 m 5 4 5 5
## 7 7 1 m 5 3 4 4
## 8 8 2 m 4 5 5 9
# The workshop data as a string
mystring <- "workshop,gender,q1,q2,q3,q4
1,1,f,1,1,5,1
2,2,f,2,1,4,1
3,1,f,2,2,4,3
4,2, ,3,1, ,3
5,1,m,4,5,2,4
6,2,m,5,4,5,5
7,1,m,5,3,4,4
8,2,m,4,5,5,5"
# Read the workshop from the string and assign it to the variable below
mydata <- read.csv(textConnection(mystring), strip.white=TRUE, na.strings="")
# Print mydata
mydata
## workshop gender q1 q2 q3 q4
## 1 1 f 1 1 5 1
## 2 2 f 2 1 4 1
## 3 1 f 2 2 4 3
## 4 2 <NA> 3 1 NA 3
## 5 1 m 4 5 2 4
## 6 2 m 5 4 5 5
## 7 1 m 5 3 4 4
## 8 2 m 4 5 5 5
Chapter 10 - Missing Values
Missing value codes include NA (not available) and NaN (not a number):
Dealing with missing values:
Example code includes:
QR3 <- c(39, 37, 40, NA, 36, 35, 37, 35)
# Create a function to calculate the number of missing values.
n.missing <- function(x) {sum(is.na(x))}
# Use n.missing to calculate the number of missing values of QR3.
missing_count <- n.missing(QR3)
missing_count
## [1] 1
# The vector random_vector is preloaded in the workspace.
random_vector <- c(1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3)
# Set all the 3s in random_vector to missing
random_vector[random_vector == 3] <- NA
# Print the new vector
random_vector
## [1] 1 2 NA 1 2 NA 1 2 NA 1 2 NA 1 2 NA
QR1 <- c(36, 34, 37, 35, 33, 32, 35, NA)
tempData <- c(36, 34, 37, 35, 33, 32, 35, NA, 37, 35, 38, 36, 35, 33, 35, 33, 39, 37, 40, NA, 36, 35, NA, 35, 36, NA, 37, 35, 34, 32, 36, 32)
my_QR_data <- as.data.frame(matrix(tempData, ncol=4))
names(my_QR_data) <- c("QR1", "QR2", "QR3", "QR4")
# Print the vector 'QR1' and inspect it
QR1
## [1] 36 34 37 35 33 32 35 NA
# Print the data frame 'my_QR_data' and inspect it
my_QR_data
## QR1 QR2 QR3 QR4
## 1 36 37 39 36
## 2 34 35 37 NA
## 3 37 38 40 37
## 4 35 36 NA 35
## 5 33 35 36 34
## 6 32 33 35 32
## 7 35 35 NA 36
## 8 NA 33 35 32
# Calculate the mean of 'QR1' by excluding the missing values
mean(QR1, na.rm=TRUE)
## [1] 34.57143
# Remove all rows that contain any missing values from 'my_QR_data'.
na.omit(my_QR_data)
## QR1 QR2 QR3 QR4
## 1 36 37 39 36
## 3 37 38 40 37
## 5 33 35 36 34
## 6 32 33 35 32
Chapter 11 - Selecting variables
Selecting Variables (1) - selecting variables from data frames is different in R and other statistical packages:
Selecting Variables (2) - can also be done through subsetting or indexing:
dplyr Package - simplifying variable selection using library(dplyr):
Example code includes:
tempData <- c(36, 34, 37, 35, 33, 32, 35, 33, 37, 35, 38, 36, 35, 33, 35, 33, 39, 37, 40, 39, 36, 35, 36, 35, 36, 33, 37, 35, 34, 32, 36, 32)
businesshours <- as.data.frame(matrix(tempData, ncol=4))
names(businesshours) <- c("QR1", "QR2", "QR3", "QR4")
# Select the QR1 variable of businesshours
my_QR1_selection <- businesshours$QR1
# Make a summary of the variables QR2 and QR3 of the data frame businesshours.
summary(data.frame(businesshours$QR2, businesshours$QR3))
## businesshours.QR2 businesshours.QR3
## Min. :33.00 Min. :35.00
## 1st Qu.:34.50 1st Qu.:35.75
## Median :35.00 Median :36.50
## Mean :35.25 Mean :37.12
## 3rd Qu.:36.25 3rd Qu.:39.00
## Max. :38.00 Max. :40.00
# Attach the businesshours variable to the temporary work area.
attach(businesshours)
## The following objects are masked _by_ .GlobalEnv:
##
## QR1, QR2, QR3, QR4
# Select the QR1 variable of businesshours and assign it to my_QR1_selection.
my_QR1_selection <- QR1
# Make a summary of the variables QR2 and QR3 of the data frame businesshours.
summary(data.frame(QR2, QR3))
## QR2 QR3
## Min. :33.00 Min. :35.0
## 1st Qu.:34.50 1st Qu.:35.5
## Median :35.00 Median :37.0
## Mean :35.25 Mean :37.0
## 3rd Qu.:36.25 3rd Qu.:38.0
## Max. :38.00 Max. :40.0
## NA's :1
# Detach the businesshours variable of the temporary work area.
detach(businesshours)
# Select the QR1 variable of businesshours using the with function and assign it my_QR1_selection
my_QR1_selection <- with(businesshours, QR1)
# Make a summary of the variables QR2 and QR3 of the data frame businesshours by using the with function.
summary(with(businesshours, data.frame(QR2, QR3)))
## QR2 QR3
## Min. :33.00 Min. :35.00
## 1st Qu.:34.50 1st Qu.:35.75
## Median :35.00 Median :36.50
## Mean :35.25 Mean :37.12
## 3rd Qu.:36.25 3rd Qu.:39.00
## Max. :38.00 Max. :40.00
# Select the QR1 variable of businesshours
my_QR1_selection <- businesshours[, "QR1"]
# Make a summary of the variables QR2 and QR3 of businesshours.
summary(businesshours[, c("QR2", "QR3")])
## QR2 QR3
## Min. :33.00 Min. :35.00
## 1st Qu.:34.50 1st Qu.:35.75
## Median :35.00 Median :36.50
## Mean :35.25 Mean :37.12
## 3rd Qu.:36.25 3rd Qu.:39.00
## Max. :38.00 Max. :40.00
businesshours$country <- c(1, 2, 1, 2, 1, 2, 1, 2)
businesshours$period <- c("bc", "bc", "bc", "bc", "ab", "ab", "ab", "ab")
# t-test of QR4 as function of period and assign it to t_test_1.
t_test_1 <- t.test(QR4 ~ period, data = businesshours)
# A paired t-test comparing QR1 and QR2 and assign it to t_test_2.
t_test_2 <- with(businesshours, t.test(QR1, QR2, paired=TRUE))
# Load the dplyr package into the memory.
library(dplyr)
# Use the select() function to select all variables starting with the variable "period" until "QR3" and all the variables in between them.
select(businesshours, period:QR3)
## period country QR4 QR3
## 1 bc 1 36 39
## 2 bc 2 33 37
## 3 bc 1 37 40
## 4 bc 2 35 39
## 5 ab 1 34 36
## 6 ab 2 32 35
## 7 ab 1 36 36
## 8 ab 2 32 35
# Use the select() function to select all variables that contain "o".
select(businesshours, dplyr::contains("o"))
## country period
## 1 1 bc
## 2 2 bc
## 3 1 bc
## 4 2 bc
## 5 1 ab
## 6 2 ab
## 7 1 ab
## 8 2 ab
# Use the select() function to select all variables that starts_with "Q".
select(businesshours, starts_with("Q"))
## QR1 QR2 QR3 QR4
## 1 36 37 39 36
## 2 34 35 37 33
## 3 37 38 40 37
## 4 35 36 39 35
## 5 33 35 36 34
## 6 32 33 35 32
## 7 35 35 36 36
## 8 33 33 35 32
# Use the `select()` function to select all variables with a numeric range from 2 to 4 and starting with "QR".
select(businesshours, num_range("QR", 2:4))
## QR2 QR3 QR4
## 1 37 39 36
## 2 35 37 33
## 3 38 40 37
## 4 36 39 35
## 5 35 36 34
## 6 33 35 32
## 7 35 36 36
## 8 33 35 32
# Use the `select()` function to select all variables that DO NOT have a numeric range from 2 to 4 and starts with "QR".
select(businesshours, -num_range("QR", 2:4))
## QR1 country period
## 1 36 1 bc
## 2 34 2 bc
## 3 37 1 bc
## 4 35 2 bc
## 5 33 1 ab
## 6 32 2 ab
## 7 35 1 ab
## 8 33 2 ab
# Make a summary of QR1 and QR2 by nesting the select() function.
summary(select(businesshours, QR1, QR2))
## QR1 QR2
## Min. :32.00 Min. :33.00
## 1st Qu.:33.00 1st Qu.:34.50
## Median :34.50 Median :35.00
## Mean :34.38 Mean :35.25
## 3rd Qu.:35.25 3rd Qu.:36.25
## Max. :37.00 Max. :38.00
# Calculate the mean of QR3 with the mean() function.
mean(businesshours$QR3)
## [1] 37.125
Chapter 12 - Selecting Observations
Selecting observations from data frames using two main techniques:
Logic rules and functions:
Example code includes:
# Select the observations of businesshours from the period before the crisis ("bc").
businesshours[businesshours$period == "bc", ]
## QR1 QR2 QR3 QR4 country period
## 1 36 37 39 36 1 bc
## 2 34 35 37 33 2 bc
## 3 37 38 40 37 1 bc
## 4 35 36 39 35 2 bc
# Select the observations of businesshours with an average number of business hours in the first quarter (QR1) bigger than 34 and smaller than or equal to 36 and make a summary of this.
summary(businesshours[businesshours$QR1 > 34 & businesshours$QR1 <= 36, ])
## QR1 QR2 QR3 QR4
## Min. :35.00 Min. :35.0 Min. :36.0 Min. :35.00
## 1st Qu.:35.00 1st Qu.:35.5 1st Qu.:37.5 1st Qu.:35.50
## Median :35.00 Median :36.0 Median :39.0 Median :36.00
## Mean :35.33 Mean :36.0 Mean :38.0 Mean :35.67
## 3rd Qu.:35.50 3rd Qu.:36.5 3rd Qu.:39.0 3rd Qu.:36.00
## Max. :36.00 Max. :37.0 Max. :39.0 Max. :36.00
## country period
## Min. :1.000 Length:3
## 1st Qu.:1.000 Class :character
## Median :1.000 Mode :character
## Mean :1.333
## 3rd Qu.:1.500
## Max. :2.000
# Load the appropriate package
library(dplyr)
# Select the observations of businesshours from the period before the crisis ("bc") using the filter() function from the dplyr package.
filter(businesshours, period == "bc")
## QR1 QR2 QR3 QR4 country period
## 1 36 37 39 36 1 bc
## 2 34 35 37 33 2 bc
## 3 37 38 40 37 1 bc
## 4 35 36 39 35 2 bc
# Select the observations of businesshours with an average number of business hours in the first quarter (QR1) bigger than 34 and smaller than or equal to 36 using the filter() function from the dplyr package and make a summary of this.
summary(filter(businesshours, QR1 > 34 & QR1 <= 36))
## QR1 QR2 QR3 QR4
## Min. :35.00 Min. :35.0 Min. :36.0 Min. :35.00
## 1st Qu.:35.00 1st Qu.:35.5 1st Qu.:37.5 1st Qu.:35.50
## Median :35.00 Median :36.0 Median :39.0 Median :36.00
## Mean :35.33 Mean :36.0 Mean :38.0 Mean :35.67
## 3rd Qu.:35.50 3rd Qu.:36.5 3rd Qu.:39.0 3rd Qu.:36.00
## Max. :36.00 Max. :37.0 Max. :39.0 Max. :36.00
## country period
## Min. :1.000 Length:3
## 1st Qu.:1.000 Class :character
## Median :1.000 Mode :character
## Mean :1.333
## 3rd Qu.:1.500
## Max. :2.000
# Print a logical vector which indicates which elements of period from businesshours are equal to "bc".
businesshours$period == "bc"
## [1] TRUE TRUE TRUE TRUE FALSE FALSE FALSE FALSE
# Print the indices of period from businesshours which are equal to "ab".
which(businesshours$period == "ab")
## [1] 5 6 7 8
# Find out whether there are subjects of period from businesshours equal to "bc".
any(businesshours$period == "bc")
## [1] TRUE
# Find out how many subjects of period from businesshours are equal to "bc".
sum(businesshours$period == "bc", na.rm=TRUE)
## [1] 4
# Find out whether all the subjects of period from businesshours are equal to "bc".
all(businesshours$period == "bc")
## [1] FALSE
Chapter 13 - Selecting Variables and Observations
Selecting variables and observations - use both row and column portions of subsetting:
Can also use dplyr to combine dplyr::select() and dplyr::filter():
Example code includes:
# Create a character vector with the variables: "period", "QR1" and "QR2" and call it `myVars`.
myVars <- c("period", "QR1", "QR2")
# Create a vector with the observations of the period equal to "bc" and call this vector `myObs`.
myObs <- which(businesshours$period == "bc")
# Select, with the two vectors from above, the variables and observations from `businesshours` by subscripting.
# Save this selection in 'mySubset', print it and make summary of it.
mySubset <- businesshours[myObs, myVars]
mySubset
## period QR1 QR2
## 1 bc 36 37
## 2 bc 34 35
## 3 bc 37 38
## 4 bc 35 36
summary(mySubset)
## period QR1 QR2
## Length:4 Min. :34.00 Min. :35.00
## Class :character 1st Qu.:34.75 1st Qu.:35.75
## Mode :character Median :35.50 Median :36.50
## Mean :35.50 Mean :36.50
## 3rd Qu.:36.25 3rd Qu.:37.25
## Max. :37.00 Max. :38.00
# Use the select() to create mySubset1 with variables period, QR1 and QR2 (in this order) selected from businesshours.
mySubset1 <- select(businesshours, period, QR1, QR2)
# Use the filter() function to select from mySubset1 the observations with the period equal to "bc". Store the result in mySubset2
mySubset2 <- filter(mySubset1, period == "bc")
# Make a summary of mySubset2
summary(mySubset2)
## period QR1 QR2
## Length:4 Min. :34.00 Min. :35.00
## Class :character 1st Qu.:34.75 1st Qu.:35.75
## Mode :character Median :35.50 Median :36.50
## Mean :35.50 Mean :36.50
## 3rd Qu.:36.25 3rd Qu.:37.25
## Max. :37.00 Max. :38.00
Chapter 14 - Transformations
Transformations - making new variables, particularly easy with dplyr::mutate():
Example code includes:
yourdata <- select(businesshours, QR1, QR2, QR3, QR4)
names(yourdata) <- c("A", "B", "C", "D")
# Copy the data frame `yourdata` and assign it to `yourdata2`.
yourdata2 <- yourdata
# Subtract all the observations of the `A` variable from the observations of `D` variable and assign it to `yourdata2$diff`.
yourdata2$diff <- yourdata2$D - yourdata2$A
# Divide all the observations from the `D` variable through the observations of `A` variable and assign it to `yourdata2$ratio`.
yourdata2$ratio <- yourdata2$D / yourdata2$A
# Compute the logarithm of the `D` variable and assign it to `yourdata2$Dlog`.
yourdata2$Dlog <- log(yourdata2$D)
# Calculate the mean of the variables `A`, `B`, `C` and `D` and assign it to `yourdata2$mean`.
yourdata2$mean <- (yourdata2$A + yourdata2$B + yourdata2$C + yourdata2$D) / 4
# Copy the data frame `yourdata` and assign it to `yourdata2`.
yourdata2 <- yourdata
# Subtract all the observations of the `A` variable from the observations of `D` variable and assign it to `yourdata2[,"diff"]`.
yourdata2[,"diff"] <- yourdata2[, "D"] - yourdata2[, "A"]
# Divide all the observations from the `D` variable through the observations of `A` variable and assign it to `yourdata2[,"ratio"]`.
yourdata2[,"ratio"] <- yourdata2[, "D"] / yourdata2[, "A"]
# Compute the logarithm of the `D` variable and assign it to `yourdata2[,"Dlog"]`.
yourdata2[,"Dlog"] <- log(yourdata2[, "D"])
# Calculate the mean of the variables `A`, `B`, `C` and `D` and assign it to `yourdata2[,"mean"]`.
yourdata2[,"mean"] <- (yourdata2[, "A"] + yourdata2[, "B"] + yourdata2[, "C"] + yourdata2[, "D"]) / 4
yourdata2 <- mutate(yourdata, diff = D - A, ratio = D / A, Dlog = log(D), mean = (A + B + C + D) / 4)
x <- 17
y <- 13 / 3
# Calculate `x` to the power 5
x ** 5
## [1] 1419857
# Calculate the exponential function of `x`
exp(x)
## [1] 24154953
# Round the square root of `y` to 2 digits after the comma
round(sqrt(y), 2)
## [1] 2.08
# Calculate the round-off error from the previous instruction
abs(sqrt(y) - round(sqrt(y), 2))
## [1] 0.001665999
Chapter 15 - Graphics
Traditional or base graphics - first graphics package available to R, revolving around the generic plot() function:
Embellishments (1) - customizing base graphics:
Plotting Groups (1) - the plot functions primary weakness in Bob’s opinion:
Scatter plot with regression - using abline() for the line:
The ggplot2 package (1) - “grammar of graphics” by Lee Wilkinson, as implemented by Hadley Wickham:
The ggplot2 package (2) - can add embellishments in many ways:
Embellishments (2) - adding to the ggplot:
Interactive graphics and graphics resources - like JMP or SAS/INSIGHT and the like:
Example code includes:
workshop = factor(c('R', 'SPSS', 'SPSS', 'SPSS', 'Stata', 'SPSS', 'R', 'R', 'SPSS', 'SPSS', 'SPSS', 'SPSS', 'SAS', 'Stata', 'SAS', 'Stata', 'SAS', 'SAS', 'R', 'R', 'SAS', 'SAS', 'R', 'R', 'R', 'Stata', 'SPSS', 'Stata', 'Stata', 'R', 'SAS', 'SAS', 'SAS', 'SPSS', 'R', 'Stata', 'R', 'SAS', 'Stata', 'Stata', 'SPSS', 'SPSS', 'SAS', 'SPSS', 'SAS', 'SPSS', 'SPSS', 'SAS', 'R', 'Stata', 'R', 'SAS', 'SPSS', 'SPSS', 'R', 'SPSS', 'SAS', 'Stata', 'R', 'Stata', 'Stata', 'R', 'SAS', 'R', 'R', 'SPSS', 'SAS', 'SPSS', 'R', 'SPSS', 'R', 'Stata', 'R', 'Stata', 'R', 'SPSS', 'SAS', 'R', 'SAS', 'SPSS', 'Stata', 'SAS', 'R', 'SPSS', 'R', 'Stata', 'SAS', 'SAS', 'R', 'Stata', 'R', 'Stata', 'R', 'R', 'R', 'SPSS', 'SAS', 'R', 'SAS', 'SPSS'), levels=c("R", "SAS", "SPSS", "Stata"))
gender = factor(c('Female', 'Male', 'Male', 'Female', 'Female', 'Female', 'Female', 'Female', 'Female', 'Female', 'Male', 'Female', 'Male', 'Female', 'Female', 'Male', 'Male', 'Female', 'Female', 'Male', 'Male', 'Male', 'Male', 'Male', 'Female', 'Female', 'Female', 'Male', 'Female', 'Female', 'Male', 'Male', 'Female', 'Female', 'Male', 'Female', 'Male', 'Male', 'Male', 'Male', 'Male', 'Male', 'Female', 'Female', 'Female', 'Male', 'Male', 'Male', 'Male', 'Male', 'Female', 'Female', 'Female', 'Male', 'Female', 'Male', 'Male', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Female', 'Female', 'Male', 'Female', 'Female', 'Male', 'Female', 'Male', 'Male', 'Male', 'Female', 'Female', 'Male', 'Female', 'Male', 'Female', 'Female', 'Female', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Male', 'Female', 'Male', 'Male', 'Male', 'Female', 'Male', 'Male', 'Male', 'Male', 'Male', 'Male', 'Male'), levels=c("Female", "Male"))
# Plot the workshop factor on the x-axis and the gender factor on the y-axis
plot(workshop, gender)
# Plot the gender factor on the x-axis and the workshop factor on the y-axis
plot(gender, workshop)
pretest = c(72, 70, 74, 80, 75, 72, 72, 83, 73, 79, 82, 77, 73, 75, 73, 81, 74, 83, 72, 72, 76, 75, 72, 67, 75, 71, 80, 70, 81, 72, 76, 79, 72, 78, 75, 69, 74, 83, 74, 71, 75, 77, 80, 81, 76, 81, 71, 63, 73, 72, 63, 78, 71, 74, 67, 78, 84, 71, 74, 85, 80, 85, 75, 74, 72, 82, 69, 77, 75, 86, 72, 67, 76, 75, 71, 76, 74, 72, 78, 73, 66, 62, 72, 82, 79, 81, 80, 77, 67, 76, 83, 58, 71, 81, 78, 79, 77, 78, 75, 70)
posttest = c(80, 75, 78, 82, 81, 77, 88, 92, 76, 84, 83, 81, 76, 74, 77, 84, 82, 86, 86, 84, 77, 81, 84, 79, 89, 76, 90, 75, 82, 86, 77, 78, 75, 81, 85, 79, 91, 90, 75, 76, 81, 82, 86, 83, 77, 90, 77, 67, 86, 83, 76, 87, 80, 78, 81, 81, 85, 72, 86, 95, 85, 95, 81, 88, 80, 84, 68, 78, 84, 90, 88, 75, 89, 78, 83, 83, 77, 87, 86, 75, 69, 71, 79, 88, 92, 96, 77, 79, 81, 86, 98, 59, 90, 88, 87, 84, 89, 92, 82, 80)
# Plot the workshop factor against the pretest variable
plot(workshop, pretest)
# Plot the pretest variable against the workshop factor
plot(pretest, workshop)
# Plot the posttest variable against the pretest variable
plot(posttest, pretest)
# Make a histogram of the pretest variable and add ticks to it
hist(pretest)
rug(pretest)
# Plot the posttest variable against the pretest variable and add all the embellishments
plot(posttest, pretest, pch=3, cex=0.5, main="Embellished plot", xlab="X values", ylab="Y values")
grid()
# Plot the pretest variable against the posttest variable and include a regression analysis manually
plot(pretest, posttest)
abline(18.78, 0.845)
# Plot the pretest variable against the posttest variable
plot(pretest, posttest)
# Create a regression model
mydata100 <- data.frame(workshop=workshop, gender=gender, pretest=pretest, posttest=posttest)
myModel <- lm(posttest ~ pretest, data = mydata100)
# Plot a regression analysis automatically
abline(coefficients(myModel))
# Plot the posttest variable against the pretest variable with the right embellishments
plot(posttest, pretest, pch=3, cex=2, main="Combination Plot", xlab="X: posttest", ylab="Y: pretest")
grid()
# Create a regression model and plot it
myModel <- lm(pretest ~ posttest)
abline(coefficients(myModel))
# Plot the workshop factor as a bar chart
library(ggplot2)
ggplot(mydata100, aes(workshop)) + geom_bar()
# Plot a bar chart of the workshop factor, filled with stacked gender information
ggplot(mydata100, aes(workshop, fill=gender)) + geom_bar(position="stack")
# Plot a bar chart of the gender factor in grey scale, filled with stacked workshop information
ggplot(mydata100, aes(gender, fill=workshop)) + geom_bar(position="stack") + scale_fill_grey()
# Plot a bar chart of the workshop factor in grey scale, filled with dodged gender information.
ggplot(mydata100, aes(workshop, fill=gender)) + geom_bar(position="dodge") + scale_fill_grey()
# Plot a grouped bar chart of the workshop factor, with the gender factor specifying the number of rows
ggplot(mydata100, aes(workshop)) + geom_bar() + facet_grid(gender ~ .)
# Make a grouped box plot of the workshop factor against the pretest variable, with the gender factor specifying the number of columns, superimposed by a scatter plot of the same data
ggplot(mydata100, aes(x=workshop, y=pretest)) + geom_boxplot() + facet_grid(. ~ gender) + geom_point()
# Make a scatter plot of the pretest variable against the posttest variable, specifying the shape of the points by the gender factor and setting their size to 5. Superimpose this plot with a regression analysis of the same data, specifying the line type again by the gender factor
ggplot(mydata100, aes(x=pretest, y=posttest, shape=gender, linetype=gender)) + geom_point(size=5) + geom_smooth(method="lm")
# Make a scatter plot of the pretest variable against the posttest variable, set the title of the plot to "Plot of Test Scores" and the x- and y-label to "Before Workshop" and "After Workshop", respectively
ggplot(mydata100, aes(x=pretest, y=posttest)) + geom_point() + labs(title="Plot of Test Scores", x="Before Workshop", y="After Workshop")
# Create a theme that starts from the theme theme_bw(), doubles the size of the title, and sets the major and minor grid lines (x and y) to white
my_white <- theme_bw() + theme(panel.grid.major.x = element_blank()) + theme(panel.grid.minor.x = element_blank()) + theme(panel.grid.major.y = element_blank()) + theme(panel.grid.minor.y = element_blank()) + theme(plot.title = element_text(size = rel(2)))
# Make a scatter plot of the pretest variable against the posttest variable, set the title of the plot to "Plot of Test Scores" and the x- and y-label to "Before Workshop" and "After Workshop", respectively, and add your own theme
ggplot(mydata100, aes(x=pretest, y=posttest)) + geom_point() + labs(title="Plot of Test Scores", x="Before Workshop", y="After Workshop") + my_white
# Create a theme that starts from the theme theme_bw(), doubles the size of the title, and sets the major and minor grid lines (x and y) to white
my_white <- theme_bw() + theme(panel.grid.major.x = element_blank(), panel.grid.minor.x = element_blank(), panel.grid.major.y = element_blank(), panel.grid.minor.y = element_blank(), plot.title = element_text(size = rel(2)))
# Make a scatter plot of the pretest variable against the posttest variable, set the title of the plot to "Plot of Test Scores" and the x- and y-label to "Before Workshop" and "After Workshop", respectively, and add your own theme
ggplot(mydata100, aes(x=pretest, y=posttest)) + geom_point() + labs(title="Plot of Test Scores", x="Before Workshop", y="After Workshop") + my_white
library(RColorBrewer)
# List the color palettes with four colors of the RColorBrewer package
display.brewer.all(n = 4)
# Plot a bar chart of the workshop factor, filled with stacked gender information, colored according to the palette Set2
ggplot(mydata100, aes(workshop, fill=gender)) + geom_bar(position="stack") + scale_fill_brewer(palette = "Set2")
# Create your theme
my_white <- theme_bw() + theme(panel.grid.major.x = element_blank(), panel.grid.minor.x = element_blank(), panel.grid.major.y = element_blank(), panel.grid.minor.y = element_blank(), plot.title = element_text(size = rel(3)))
# Plot!
ggplot(mydata100, aes(x=pretest, y=posttest, shape=gender, linetype=gender)) + geom_point(size=2) + facet_grid(workshop ~ gender) + labs(title="Combination Plot", x="Before Workshop", y="After Workshop") + geom_smooth(method="lm") + my_white
Chapter 16 - Writing Functions
Writing Functions - similar to macros in other languages:
Applying Functions by Group; Anonymous Functions:
Debugging Tips - general R programming tips, but especially useful for newly written functions:
Example code includes:
# Write a function mymean that returns the mean of a vector, removing the missing values and without naming the result
mymean <- function(x) { mean(x, na.rm=TRUE) }
# Apply mymean on `pretest`
mymean(pretest)
## [1] 74.97
# Write a function mystats that returns the mean, the standard deviation, the median, the maximum and the minimum of a vector, in that order, removing the missing values.
mystats <- function(x) {
c(mean=mean(x, na.rm=TRUE), sd=sd(x, na.rm=TRUE), median=median(x, na.rm=TRUE),
max=max(x, na.rm=TRUE), min=min(x, na.rm=TRUE)
)
}
# Apply mystats on pretest
mystats(pretest)
## mean sd median max min
## 74.970000 5.296187 75.000000 86.000000 58.000000
# Calculate the mean, standard deviation, median, maximum and minimum by using the pre-loaded function of the pretest variable that is grouped by gender
by(pretest, gender, mystats)
## gender: Female
## mean sd median max min
## 74.617021 5.289667 74.000000 86.000000 62.000000
## --------------------------------------------------------
## gender: Male
## mean sd median max min
## 75.283019 5.332691 75.000000 85.000000 58.000000
# Calculate the mean and the minimum (in that order and without names) in
# an anonymous function of the pretest variable that is grouped by gender.
by(pretest, gender, function(x) { c(mean(x), min(x)) } )
## gender: Female
## [1] 74.61702 62.00000
## --------------------------------------------------------
## gender: Male
## [1] 75.28302 58.00000
# Debug the code
by(pretest, gender, function(x){c(mean(na.rm = TRUE, x), sd(x, TRUE), median(x = x, na.rm = TRUE)) } )
## gender: Female
## [1] 74.617021 5.289667 74.000000
## --------------------------------------------------------
## gender: Male
## [1] 75.283019 5.332691 75.000000
Chapter 17 - Basic Statistics
The R’s built-in Means functions:
Getting summary statistics - default digits can be over-ridden in the options:
R’s description capabilities:
R’s tabulation possibilities - the built-in cross-tab is table() with 2+ variables called:
Example code (not run due to lack of dataset) includes:
# DO NOT HAVE THIS DATASET
talent_scores <- matrix(data=-1L, nrow=505, ncol=9)
talent_scores[, 1] <- c(87, 76, 90, 82, 94, 86, 76, 99, 93, 79, 85, 82, 87, 81, 99, 76, 91, 92, 81, 92, 86, 86, 67, 92, 92, 78, 83, 97, 76, 99, 96, 83, 83, 82, 89, 104, 84, 88, 85, 86, 90, 94, 99, 71, 89, 106, 71, 79, 85, 82, 69, 89, 47, 74, 59, 69, 67, 88, 99, 88, 82, 72, 100, 80, 75, 84, 100, 78, 94, 98, 74, 95, 86, 78, 64, 86, 94, 74, 73, 97, 78, 93, 78, 75, 71, 102, 98, 56, 94, 100, 73, 80, 79, 100, 98, 75, 83, 87, 82, 75, 81, 92, 78, 76, 84, 93, 98, 70, 85, 77, 1, 67, 62, 73, 79, 76, 91, 78, 87, 90, 67, 93, 84, 77, 106, 86, 91, 89, 94, 93, 69, 78, 85, 77, 89, 92, 79, 101, 95, 63, 80, 96, 86, 84, 63, 76, 90, 81, 87, 86, 83, 90, 65, 71, 64, 77, 85, 85, 65, 81, 92, 100, 88, 72, 95, 63, 86, 101, 79, 84, 91, 75, 79, 81, 82, 70, 92, 66, 81, 85, 83, 94, 98, 77, 103, 86, 79, 96, 52, 83, 74, 83, 92, 82, 95, 68, 81, 82, 87, 96, 43, 97, 73, 81, 96, 74, 75, 81, 91, 69, 91, 87, 61, 72, 90, 95, 97, 83, 90, 91, 80, 73, 89, 85, 80, 81, 80, 57, 79, 83, 80, 40, 99, 94, 94, 94, 92, 108, 79, 93, 66, 89, 0, 86, 95, 84, 101, 87, 93, 78, 98, 96, 95, 81, 88, 84, 88, 102, 84, 80, 85, 81, 91, 93, 83, 104, 98, 99, 86, 91, 107, 94, 87, 103, 92, 99, 104, 79, 72, 63, 94, 84, 96, 82, 80, 96, 82, 74, 96, 102, 96, 89, 93, 94, 84, 93, 81, 95, 66, 103, 102, 86, 91, 103, 79, 111, 97, 88, 87, 77, 94, 83, 71, 99, 98, 81, 109, 107, 95, 93, 110, 84, 80, 108, 79, 100, 81, 79, 96, 88, 75, 87, 85, 82, 78, 77, 102, 87, 100, 82, 82, 80, 96, 76, 81, 84, 93, 74, 99, 88, 75, 86, 81, 100, 99, 96, 84, 104, 87, 81, 86, 98, 99, 91, 92, 87, 99, 98, 77, 76, 88, 86, 86, 80, 90, 90, 81, 90, 90, 90, 89, 99, 75, 84, 92, 80, 66, 90, 105, 75, 99, 100, 81, 87, 94, 81, 80, 95, 88, 93, 79, 88, 105, 91, 94, 67, 81, 107, 80, 74, 88, 90, 90, 93, 94, 85, 99, 94, 93, 104, 84, 86, 109, 72, 102, 89, 90, 87, 91, 88, 97, 80, 88, 83, 100, 103, 58, 81, 92, 90, 98, 90, 78, 83, 75, 93, 89, 73, 84, 107, 85, 94, 80, 91, 91, 86, 107, 83, 102, 94, 89, 86, 100, 96, 93, 95, 98, 105, 93, 88, 99, 70, 80, 78, 84, 90, 99, 90, 98, 85, 88, 97, 95, 95, 104, 90, 76, 79, 85, 81, 96, 92, 99, 88, 98, 84, 82, 97, 100, 96, 72, 63, 92, 87, 105)
talent_scores[, 2] <- c(39, 15, 28, 47, 40, 21, 33, 46, 42, 38, 42, 32, 39, 43, 41, 34, 41, 38, 32, 41, 32, 43, 24, 43, 43, 25, 36, 45, 27, 39, 44, 36, 33, 27, 43, 47, 36, 42, 41, 44, 43, 40, 44, 23, 33, 48, 34, 23, 39, 36, 23, 25, 24, 22, 26, 5, 29, 45, 38, 44, 44, 43, 46, 34, 34, 40, 36, 20, 44, 33, 28, 39, 39, 44, 28, 37, 45, 20, 32, 38, 28, 39, 27, 30, 24, 44, 44, 16, 47, 42, 25, 30, 24, 43, 44, 29, 42, 41, 28, 21, 27, 35, 31, 39, 36, 46, 43, 41, 30, 39, 44, 15, 14, 34, 20, 25, 26, 22, 39, 36, 19, 45, 38, 38, 44, 36, 42, 27, 34, 40, 26, 23, 34, 27, 40, 46, 31, 47, 45, 19, 28, 45, 33, 45, 22, 23, 35, 43, 36, 24, 36, 35, 17, 17, 12, 30, 35, 29, 16, 23, 39, 46, 39, 16, 36, 14, 28, 43, 31, 19, 28, 31, 30, 30, 40, 12, 46, 14, 39, 25, 31, 34, 35, 40, 47, 19, 35, 46, 40, 31, 22, 31, 42, 31, 44, 4, 41, 25, 32, 42, 18, 45, 36, 38, 43, 40, 28, 30, 39, 25, 36, 35, 20, 24, 43, 43, 45, 36, 35, 28, 34, 44, 40, 33, 34, 41, 29, 21, 16, 41, 23, 7, 46, 40, 39, 39, 37, 46, 15, 46, 11, 42, 42, 23, 40, 37, 43, 28, 33, 23, 45, 42, 41, 43, 40, 24, 17, 45, 25, 27, 24, 32, 39, 32, 32, 43, 43, 39, 26, 43, 42, 41, 28, 45, 30, 42, 42, 28, 14, 15, 32, 31, 44, 31, 21, 45, 32, 27, 37, 44, 44, 32, 44, 38, 42, 33, 28, 42, 17, 37, 42, 30, 42, 33, 24, 46, 37, 35, 36, 35, 40, 37, 16, 41, 43, 24, 45, 42, 31, 37, 47, 27, 26, 39, 38, 40, 27, 28, 40, 34, 37, 41, 30, 22, 42, 28, 48, 24, 38, 18, 24, 36, 45, 40, 16, 36, 44, 20, 46, 37, 29, 35, 24, 46, 47, 0, 30, 41, 40, 31, 26, 39, 38, 36, 37, 31, 44, 33, 25, 26, 21, 36, 35, 24, 21, 33, 31, 32, 38, 39, 37, 44, 27, 44, 36, 21, 36, 36, 45, 29, 29, 42, 20, 48, 42, 19, 34, 45, 27, 25, 37, 18, 46, 25, 39, 14, 38, 38, 23, 17, 26, 33, 30, 37, 29, 22, 39, 29, 44, 43, 27, 41, 40, 33, 30, 28, 28, 41, 24, 19, 43, 27, 42, 16, 31, 46, 14, 34, 41, 19, 40, 23, 36, 23, 29, 41, 34, 20, 37, 44, 37, 37, 29, 28, 37, 38, 43, 22, 44, 45, 43, 27, 48, 33, 23, 34, 39, 41, 36, 37, 41, 18, 34, 34, 29, 30, 46, 23, 45, 32, 39, 37, 38, 28, 44, 40, 39, 30, 26, 44, 30, 32, 40, 40, 43, 27, 23, 45, 48, 41, 20, 21, 41, 22, 45)
talent_scores[, 3] <- c(9, 7, 8, 13, 10, 10, 9, 18, 10, 14, 12, 10, 16, 8, 13, 7, 11, 11, 5, 17, 10, 5, 9, 12, 16, 10, 14, 10, 8, 9, 18, 13, 7, 10, 18, 8, 18, 13, 15, 14, 9, 13, 17, 1, 7, 18, 7, 8, 14, 12, 11, 7, 7, 5, 6, 6, 9, 14, 12, 11, 7, 16, 13, 10, 10, 19, 9, 5, 11, 14, 8, 13, 14, 11, 1, 8, 11, 5, 9, 10, 8, 13, 7, 14, 8, 18, 13, 5, 15, 13, 14, 6, 11, 16, 15, 3, 10, 14, 12, 11, 12, 11, 7, 14, 16, 13, 14, 9, 16, 10, 15, 13, 5, 14, 8, 8, 9, 12, 13, 13, 7, 15, 10, 9, 18, 10, 11, 8, 12, 16, 9, 7, 4, 8, 8, 15, 7, 15, 12, 4, 9, 18, 6, 14, 8, 3, 9, 12, 16, 6, 11, 15, 5, 9, 4, 8, 7, 6, 6, 7, 16, 6, 11, 9, 9, 7, 15, 8, 7, 7, 11, 7, 7, 5, 16, 11, 14, 5, 5, 9, 9, 15, 13, 9, 17, 13, 9, 6, 9, 10, 7, 14, 13, 10, 14, 5, 13, 8, 12, 16, 1, 10, 10, 10, 10, 10, 3, 2, 15, 8, 13, 8, 9, 9, 18, 16, 11, 11, 9, 7, 10, 9, 11, 8, 13, 15, 8, 7, 4, 12, 8, 3, 16, 14, 6, 9, 14, 10, 13, 9, 6, 8, 8, 4, 10, 6, 13, 6, 14, 9, 8, 13, 8, 9, 13, 11, 3, 13, 12, 5, 5, 5, 11, 9, 3, 13, 15, 10, 2, 12, 17, 15, 6, 15, 9, 10, 11, 8, 12, 4, 11, 4, 13, 12, 4, 13, 8, 9, 6, 9, 11, 7, 10, 14, 9, 8, 4, 13, 4, 8, 9, 9, 11, 5, 11, 15, 11, 12, 11, 8, 2, 12, 3, 10, 16, 7, 17, 10, 7, 12, 19, 8, 8, 7, 8, 13, 11, 9, 9, 9, 4, 8, 7, 6, 10, 15, 19, 4, 15, 4, 7, 11, 13, 8, 5, 6, 9, 9, 8, 13, 5, 7, 7, 10, 18, 0, 11, 15, 14, 10, 8, 5, 9, 9, 7, 4, 15, 10, 10, 5, 4, 8, 16, 6, 10, 9, 7, 8, 8, 14, 14, 11, 4, 9, 12, 7, 10, 7, 15, 8, 8, 10, 3, 13, 10, 7, 9, 13, 5, 6, 8, 8, 16, 10, 10, 6, 8, 6, 5, 3, 6, 10, 4, 11, 9, 7, 13, 12, 9, 15, 2, 11, 16, 7, 9, 9, 2, 7, 8, 9, 13, 7, 17, 7, 14, 16, 2, 8, 11, 4, 12, 5, 11, 4, 7, 13, 11, 3, 13, 13, 12, 10, 13, 6, 7, 6, 11, 1, 11, 10, 17, 2, 11, 9, 10, 2, 12, 15, 13, 11, 7, 9, 16, 12, 11, 10, 16, 5, 15, 11, 11, 11, 8, 13, 16, 13, 12, 6, 7, 12, 14, 5, 13, 7, 16, 1, 4, 15, 15, 11, 8, 7, 11, 5, 14)
talent_scores[, 4] <- c(12, 10, 12, 14, 15, 14, 12, 20, 17, 18, 17, 18, 17, 10, 10, 9, 12, 14, 14, 17, 12, 11, 9, 15, 19, 15, 16, 16, 10, 17, 15, 11, 15, 12, 17, 13, 16, 12, 19, 20, 17, 15, 20, 5, 16, 13, 15, 8, 12, 16, 16, 11, 8, 14, 16, 9, 13, 12, 16, 15, 14, 16, 16, 17, 12, 15, 14, 8, 10, 11, 15, 15, 18, 18, 10, 13, 15, 9, 15, 10, 11, 12, 12, 13, 16, 15, 9, 11, 14, 14, 10, 11, 19, 17, 17, 18, 17, 16, 15, 14, 16, 13, 12, 18, 12, 15, 18, 13, 12, 9, 18, 18, 9, 10, 10, 13, 18, 12, 11, 13, 15, 15, 16, 12, 20, 16, 19, 13, 13, 16, 15, 18, 13, 14, 7, 19, 13, 16, 13, 7, 10, 20, 13, 20, 10, 8, 9, 14, 20, 9, 14, 20, 9, 13, 6, 18, 11, 11, 4, 13, 15, 9, 16, 17, 11, 5, 11, 17, 10, 12, 12, 14, 14, 11, 15, 8, 14, 9, 18, 12, 10, 16, 18, 8, 19, 12, 11, 13, 14, 14, 14, 13, 19, 15, 17, 19, 13, 14, 13, 10, 4, 18, 19, 12, 19, 8, 8, 8, 17, 7, 14, 8, 10, 14, 15, 16, 18, 15, 14, 5, 11, 14, 16, 14, 17, 16, 7, 11, 14, 12, 19, 6, 17, 17, 4, 9, 18, 11, 5, 5, 7, 9, 14, 4, 9, 6, 15, 6, 9, 10, 9, 13, 13, 7, 9, 7, 2, 11, 10, 6, 8, 7, 9, 4, 4, 17, 18, 12, 4, 8, 13, 10, 10, 14, 12, 15, 13, 10, 4, 8, 11, 5, 14, 7, 11, 15, 8, 7, 16, 11, 9, 11, 14, 10, 7, 10, 4, 12, 6, 9, 11, 8, 7, 9, 8, 14, 11, 13, 12, 6, 11, 7, 2, 14, 13, 12, 14, 7, 4, 9, 17, 8, 6, 6, 6, 8, 8, 9, 9, 8, 11, 12, 7, 4, 12, 10, 11, 5, 10, 4, 11, 4, 6, 8, 8, 5, 12, 11, 15, 11, 2, 6, 10, 12, 14, 0, 13, 9, 10, 11, 8, 12, 8, 10, 14, 2, 10, 13, 7, 7, 6, 8, 6, 7, 6, 10, 5, 7, 8, 12, 12, 15, 8, 7, 10, 7, 7, 8, 16, 5, 10, 9, 6, 12, 12, 7, 7, 11, 7, 9, 9, 2, 11, 7, 9, 7, 10, 6, 6, 4, 4, 10, 3, 13, 6, 7, 14, 12, 9, 10, 4, 9, 8, 8, 8, 8, 6, 6, 7, 5, 12, 6, 18, 2, 8, 17, 8, 9, 3, 9, 9, 6, 12, 4, 5, 15, 11, 10, 11, 13, 12, 10, 7, 5, 6, 10, 10, 1, 8, 10, 7, 4, 16, 7, 5, 13, 9, 11, 13, 9, 9, 13, 6, 8, 12, 7, 8, 11, 13, 8, 7, 10, 10, 6, 18, 16, 11, 11, 7, 10, 8, 9, 11, 10, 12, 8, 5, 15, 15, 11, 7, 6, 13, 6, 11)
talent_scores[, 5] <- c(9, 10, 9, 12, 12, 11, 9, 15, 13, 11, 12, 8, 11, 11, 8, 5, 11, 11, 13, 11, 7, 11, 7, 12, 12, 7, 12, 11, 13, 11, 10, 8, 11, 11, 10, 14, 8, 14, 12, 12, 13, 6, 10, 9, 11, 12, 11, 8, 12, 11, 12, 8, 4, 8, 8, 8, 9, 10, 11, 12, 8, 9, 9, 11, 10, 9, 9, 8, 10, 9, 8, 11, 15, 10, 3, 7, 14, 9, 10, 12, 6, 12, 9, 11, 12, 14, 11, 5, 10, 11, 8, 10, 6, 14, 9, 12, 11, 13, 7, 10, 7, 11, 2, 10, 12, 13, 11, 12, 9, 9, 12, 9, 5, 5, 9, 7, 12, 9, 10, 11, 7, 10, 12, 6, 14, 11, 11, 10, 10, 12, 9, 11, 10, 9, 10, 12, 13, 13, 12, 8, 9, 10, 11, 15, 9, 7, 7, 10, 8, 11, 12, 10, 2, 9, 4, 6, 10, 2, 4, 9, 11, 11, 10, 7, 4, 5, 8, 13, 12, 5, 10, 6, 12, 9, 8, 2, 10, 2, 12, 9, 11, 15, 9, 8, 15, 10, 9, 12, 7, 10, 10, 9, 13, 7, 8, 12, 11, 11, 10, 8, 1, 12, 10, 7, 12, 7, 7, 7, 11, 4, 9, 8, 6, 10, 10, 12, 14, 14, 13, 9, 8, 7, 13, 11, 10, 11, 11, 9, 9, 11, 12, 2, 11, 14, 9, 6, 11, 14, 5, 11, 5, 12, 12, 8, 11, 9, 12, 6, 9, 12, 9, 13, 12, 9, 11, 9, 5, 11, 7, 6, 11, 9, 7, 15, 3, 14, 13, 10, 14, 9, 12, 14, 10, 11, 9, 13, 9, 12, 9, 4, 8, 9, 12, 10, 7, 14, 8, 1, 11, 12, 12, 8, 9, 9, 8, 9, 8, 9, 4, 12, 10, 6, 8, 10, 7, 13, 9, 7, 15, 10, 8, 11, 9, 13, 11, 13, 13, 11, 10, 11, 14, 10, 4, 10, 12, 7, 14, 10, 8, 9, 0, 7, 8, 6, 9, 13, 13, 4, 10, 10, 12, 5, 9, 8, 8, 11, 10, 6, 13, 10, 4, 7, 11, 11, 14, 0, 8, 11, 10, 12, 8, 10, 13, 9, 9, 9, 13, 10, 6, 10, 7, 10, 3, 10, 9, 9, 10, 10, 9, 10, 13, 11, 9, 9, 8, 10, 13, 10, 15, 11, 13, 14, 8, 13, 12, 7, 12, 6, 12, 9, 8, 8, 11, 7, 10, 7, 6, 9, 6, 4, 4, 10, 7, 10, 7, 8, 11, 10, 12, 12, 6, 7, 12, 13, 9, 7, 5, 8, 11, 6, 12, 8, 11, 5, 13, 14, 3, 10, 9, 3, 11, 3, 7, 5, 6, 9, 7, 7, 8, 13, 13, 11, 6, 10, 10, 10, 14, 8, 7, 10, 10, 11, 12, 9, 7, 11, 9, 12, 12, 12, 11, 8, 3, 10, 11, 10, 12, 9, 11, 10, 7, 11, 9, 10, 14, 12, 11, 12, 7, 12, 6, 10, 9, 11, 12, 8, 7, 11, 13, 10, 9, 7, 11, 8, 11)
talent_scores[, 6] <- c(20, 15, 26, 29, 32, 21, 25, 51, 31, 39, 32, 31, 34, 34, 34, 16, 32, 35, 30, 27, 15, 42, 16, 37, 39, 23, 39, 49, 17, 44, 43, 10, 27, 19, 42, 47, 18, 28, 41, 37, 32, 23, 32, 15, 24, 37, 14, 9, 36, 39, 13, 13, 11, 16, 22, 14, 13, 33, 27, 27, 27, 33, 50, 24, 13, 32, 41, 21, 44, 36, 19, 35, 39, 39, 23, 15, 46, 15, 26, 25, 20, 43, 24, 11, 17, 49, 45, 12, 36, 44, 27, 26, 24, 49, 31, 25, 17, 40, 27, 12, 19, 27, 16, 36, 27, 47, 36, 29, 15, 23, 0, 19, 13, 20, 24, 25, 39, 19, 30, 22, 12, 44, 38, 19, 48, 16, 31, 26, 23, 39, 25, 26, 31, 20, 24, 36, 20, 39, 28, 15, 13, 52, 29, 27, 20, 12, 20, 31, 29, 12, 24, 36, 16, 15, 11, 20, 22, 18, 13, 21, 34, 37, 38, 10, 24, 20, 19, 39, 15, 17, 28, 20, 17, 17, 25, 11, 38, 9, 20, 15, 31, 37, 25, 20, 46, 18, 18, 24, 12, 22, 22, 20, 42, 25, 43, 30, 16, 19, 29, 31, 8, 36, 30, 20, 37, 20, 15, 23, 26, 21, 24, 13, 16, 27, 36, 40, 48, 12, 21, 15, 25, 20, 39, 23, 18, 22, 10, 15, 15, 24, 29, 12, 45, 28, 15, 24, 29, 43, 27, 28, 12, 32, 0, 15, 35, 11, 40, 7, 27, 13, 20, 45, 22, 15, 28, 11, 12, 37, 18, 13, 19, 16, 34, 24, 13, 24, 46, 25, 15, 29, 39, 41, 20, 31, 19, 39, 33, 27, 7, 9, 22, 27, 38, 23, 18, 37, 5, 13, 39, 27, 30, 30, 25, 21, 16, 17, 18, 35, 14, 24, 34, 21, 17, 19, 16, 47, 31, 11, 26, 18, 21, 17, 7, 25, 29, 19, 33, 42, 16, 44, 51, 26, 15, 25, 19, 23, 17, 25, 17, 21, 11, 16, 20, 13, 20, 25, 41, 19, 22, 14, 29, 19, 22, 20, 11, 10, 28, 16, 41, 39, 10, 22, 14, 23, 23, 36, 27, 43, 20, 19, 18, 23, 24, 16, 23, 23, 25, 26, 13, 20, 11, 20, 28, 19, 23, 20, 16, 28, 25, 25, 16, 37, 20, 31, 31, 16, 15, 11, 39, 10, 23, 21, 17, 37, 20, 11, 20, 46, 16, 25, 12, 16, 34, 18, 29, 17, 26, 33, 10, 6, 12, 17, 18, 22, 25, 18, 24, 20, 19, 27, 11, 27, 31, 21, 20, 19, 20, 31, 16, 19, 21, 25, 36, 13, 25, 42, 20, 16, 21, 17, 18, 11, 22, 13, 16, 26, 23, 14, 26, 43, 20, 39, 18, 24, 22, 24, 34, 14, 23, 41, 29, 17, 46, 25, 15, 18, 31, 36, 24, 17, 25, 26, 19, 17, 19, 10, 18, 11, 24, 19, 15, 24, 20, 17, 34, 24, 22, 17, 12, 25, 22, 21, 23, 31, 26, 8, 8, 39, 27, 23, 17, 5, 18, 18, 26)
talent_scores[, 7] <- c(10, 4, 9, 4, 11, 6, 11, 9, 6, 9, 6, 1, 4, 8, 7, 8, 2, 7, 4, 8, 8, 2, 5, 11, 2, 8, 3, 9, 11, 9, 8, 3, 3, 8, 12, 10, 9, 9, 5, 5, 5, 7, 11, 7, 7, 5, 6, 6, 8, 5, 6, 9, 6, 10, 3, 7, 3, 2, 1, 2, 7, 12, 8, 9, 8, 3, 8, 7, 8, 10, 5, 5, 11, 5, 6, 10, 11, 3, 9, 2, 12, 6, 10, 10, 6, 10, 11, 5, 3, 7, 10, 11, 10, 5, 1, 9, 4, 4, 9, 8, 8, 1, 12, 7, 10, 3, 12, 4, 5, 3, 9, 6, 11, 5, 9, 4, 8, 2, 6, 8, 3, 7, 10, 10, 5, 3, 9, 12, 4, 5, 3, 10, 10, 9, 11, 10, 8, 4, 7, 5, 7, 5, 6, 4, 8, 12, 9, 9, 9, 10, 5, 10, 6, 10, 3, 2, 11, 8, 11, 4, 9, 7, 11, 6, 12, 7, 9, 11, 3, 6, 11, 7, 10, 2, 4, 10, 9, 3, 5, 8, 10, 7, 6, 8, 8, 10, 10, 10, 10, 10, 6, 2, 4, 12, 10, 3, 4, 7, 4, 10, 4, 5, 7, 8, 9, 7, 4, 4, 2, 1, 2, 6, 0, 2, 7, 10, 9, 8, 9, 9, 3, 1, 8, 5, 7, 11, 7, 2, 6, 10, 5, 2, 9, 6, 6, 10, 4, 6, 8, 11, 9, 5, 6, 4, 7, 6, 11, 9, 10, 6, 7, 10, 12, 7, 7, 8, 7, 10, 2, 10, 10, 11, 7, 11, 9, 10, 12, 8, 12, 3, 6, 9, 5, 5, 7, 10, 8, 8, 10, 7, 8, 10, 7, 4, 7, 10, 11, 10, 5, 9, 12, 10, 11, 9, 4, 8, 8, 8, 10, 11, 6, 7, 6, 8, 7, 3, 9, 10, 7, 7, 5, 10, 9, 6, 7, 5, 2, 6, 10, 8, 3, 7, 3, 8, 2, 4, 11, 4, 5, 11, 5, 9, 9, 11, 8, 8, 6, 8, 7, 10, 10, 11, 11, 3, 8, 4, 3, 8, 8, 5, 9, 6, 9, 7, 6, 9, 11, 10, 7, 9, 8, 9, 7, 4, 3, 6, 7, 5, 3, 6, 11, 4, 1, 10, 2, 11, 9, 9, 4, 10, 10, 7, 3, 4, 0, 10, 8, 0, 1, 6, 6, 10, 11, 7, 7, 2, 7, 10, 11, 7, 8, 9, 7, 4, 11, 7, 9, 10, 7, 8, 7, 8, 10, 11, 7, 5, 8, 6, 5, 10, 6, 9, 5, 5, 5, 4, 10, 3, 5, 9, 7, 10, 9, 2, 11, 8, 6, 10, 11, 10, 7, 8, 10, 11, 7, 5, 10, 7, 8, 10, 9, 10, 11, 10, 7, 11, 9, 7, 7, 10, 12, 7, 10, 8, 10, 7, 9, 7, 2, 1, 2, 5, 10, 10, 11, 7, 5, 10, 10, 9, 11, 3, 7, 6, 5, 7, 9, 11, 5, 5, 3, 9, 12, 1, 10, 1, 5, 10, 8, 1, 7, 8, 7, 8, 1)
talent_scores[, 8] <- c(20, 15, 8, 28, 26, 8, 16, 36, 33, 30, 27, 20, 23, 28, 33, 9, 17, 18, 13, 21, 25, 28, 27, 26, 31, 17, 24, 34, 24, 28, 27, 24, 23, 18, 34, 34, 26, 18, 24, 19, 14, 34, 30, 6, 18, 11, 6, 19, 18, 40, 23, 21, 13, 11, 12, 15, 14, 24, 16, 18, 31, 36, 36, 20, 19, 29, 36, 2, 31, 31, 13, 24, 33, 25, 21, 26, 26, 21, 8, 11, 17, 28, 35, 10, 23, 33, 33, 20, 37, 34, 26, 29, 14, 30, 28, 9, 7, 13, 34, 34, 21, 16, 8, 24, 26, 36, 36, 11, 25, 14, 22, 16, 24, 21, 19, 18, 33, 4, 13, 11, 13, 28, 27, 22, 34, 22, 29, 13, 19, 25, 10, 21, 11, 4, 18, 33, 19, 27, 26, 19, 15, 38, 16, 20, 4, 12, 19, 26, 26, 19, 18, 39, 16, 26, 16, 31, 8, 33, 13, 14, 36, 21, 26, 26, 33, 18, 19, 11, 9, 22, 23, 19, 16, 0, 37, 25, 31, 9, 13, 6, 38, 33, 5, 8, 36, 30, 29, 4, 21, 18, 13, 18, 27, 32, 30, 26, 6, 13, 30, 29, 22, 33, 31, 11, 29, 31, 13, 13, 11, 17, 15, 18, 14, 23, 20, 22, 38, 10, 4, 8, 11, 3, 36, 20, 16, 20, 5, 19, 10, 17, 24, 16, 36, 19, 31, 5, 14, 18, 5, 16, 6, 11, 24, 13, 6, 3, 14, 7, 1, 17, 5, 26, 29, 6, 27, 21, 2, 14, 17, 11, 16, 33, 25, 1, 4, 29, 29, 14, 11, 11, 22, 16, 3, 4, 13, 31, 17, 13, 6, 3, 12, 14, 24, 8, 18, 18, 1, 4, 24, 18, 7, 21, 4, 4, 18, 15, 24, 29, 24, 11, 9, 0, 4, 13, 14, 18, 1, 13, 19, 7, 12, 21, 8, 12, 9, 5, 21, 13, 3, 18, 34, 9, 0, 12, 8, 16, 7, 6, 3, 13, 12, 23, 4, 5, 13, 7, 14, 1, 3, 6, 12, 7, 11, 8, 14, 15, 11, 11, 25, 18, 8, 15, 13, 13, 6, 17, 13, 13, 11, 21, 5, 16, 9, 4, 8, 14, 4, 7, 8, 4, 4, 4, 1, 4, 3, 13, 7, 9, 16, 8, 15, 24, 7, 16, 12, 0, 3, 1, 8, 10, 5, 17, 0, 10, 7, 21, 13, 15, 7, 19, 21, 4, 18, 4, 19, 4, 14, 9, 10, 9, 11, 4, 5, 2, 21, 15, 11, 23, 9, 8, 8, 3, 4, 12, 13, 23, 5, 18, 4, 11, 21, 5, 21, 6, 9, 24, 7, 24, 4, 15, 1, 0, 7, 13, 1, 33, 8, 16, 10, 24, 1, 11, 8, 24, 0, 18, 14, 10, 13, 23, 15, 11, 13, 8, 4, 21, 13, 0, 13, 8, 19, 16, 4, 21, 6, 18, 7, 7, 2, 2, 3, 16, 6, 7, 22, 11, 20, 25, 17, 19, 11, 15, 25, 20, 9, 1, 7, 31, 19, 20, 16, 21, 11, 12, 11)
talent_scores[, 9] <- c(18, 13, 6, 24, 1, 9, 11, 2, 16, 3, 12, 23, 11, 19, 4, 6, 20, 14, 3, 20, 3, 14, 24, 6, 14, 14, 16, 17, 12, 10, 11, 19, 23, 11, 14, 13, 3, 10, 19, 11, 6, 9, 23, 4, 16, 0, 4, 23, 23, 21, 7, 20, 13, 7, 13, 16, 1, 24, 1, 24, 1, 6, 7, 14, 14, 8, 9, 1, 9, 13, 1, 4, 11, 9, 17, 13, 14, 3, 27, 8, 4, 17, 14, 10, 10, 6, 24, 30, 13, 17, 6, 16, 14, 3, 13, 1, 4, 24, 26, 16, 13, 0, 7, 17, 6, 24, 4, 4, 17, 10, 39, 19, 13, 6, 14, 20, 11, 1, 6, 6, 10, 13, 13, 13, 19, 14, 14, 17, 13, 2, 11, 14, 13, 0, 26, 19, 4, 14, 24, 14, 11, 1, 25, 21, 1, 27, 19, 14, 4, 13, 4, 24, 23, 23, 27, 20, 0, 20, 10, 16, 17, 23, 11, 21, 24, 10, 3, 4, 1, 13, 17, 10, 1, 1, 24, 10, 6, 0, 9, 3, 4, 11, 17, 3, 19, 31, 3, 6, 17, 27, 0, 17, 16, 16, 9, 7, 7, 21, 18, 6, 30, 13, 16, 7, 16, 14, 18, 1, 4, 6, 10, 20, 6, 14, 14, 36, 6, 0, 1, 7, 6, 0, 23, 0, 25, 10, 6, 11, 7, 7, 14, 7, 21, 10, 15, 11, 20, 17, 39, 36, 17, 23, 24, 37, 29, 15, 11, 36, 36, 27, 26, 24, 34, 13, 26, 31, 30, 23, 30, 19, 17, 37, 11, 17, 37, 31, 29, 37, 25, 3, 1, 30, 27, 24, 30, 13, 21, 19, 39, 27, 29, 11, 23, 40, 30, 19, 33, 34, 16, 33, 31, 24, 16, 23, 31, 27, 29, 26, 21, 10, 20, 26, 24, 39, 30, 24, 36, 19, 23, 15, 36, 29, 0, 9, 17, 33, 33, 27, 24, 10, 10, 29, 21, 29, 33, 24, 29, 34, 33, 21, 24, 39, 23, 36, 37, 30, 9, 6, 36, 19, 37, 6, 23, 26, 24, 29, 17, 26, 3, 20, 24, 7, 34, 23, 19, 17, 36, 33, 30, 31, 39, 37, 40, 30, 39, 30, 26, 21, 33, 33, 36, 14, 3, 30, 34, 27, 26, 26, 34, 16, 27, 26, 30, 16, 36, 29, 17, 16, 37, 19, 33, 39, 11, 0, 33, 19, 34, 3, 33, 36, 21, 40, 9, 40, 13, 31, 27, 37, 29, 26, 33, 40, 39, 29, 24, 37, 7, 23, 4, 27, 36, 37, 29, 14, 23, 27, 37, 26, 37, 35, 24, 4, 33, 33, 23, 13, 24, 34, 24, 16, 30, 21, 29, 23, 26, 7, 29, 34, 21, 37, 36, 10, 6, 23, 26, 11, 29, 23, 24, 39, 39, 31, 13, 33, 37, 24, 4, 36, 30, 31, 9, 23, 33, 36, 19, 29, 10, 34, 31, 33, 30, 34, 24, 34, 17, 19, 7, 40, 23, 20, 6, 26, 26, 23, 37, 29, 34, 39, 11, 27, 26, 11, 11, 37, 27)
talent_scores <- as.data.frame(talent_scores)
names(talent_scores) <- c('english', 'reading', 'creativity', 'mechanical', 'abstract_reasoning', 'math', 'social', 'physical_science', 'office_inventory')
# Compute the mean of the scores for each student individually
rowMeans(talent_scores)
## [1] 24.88889 18.33333 21.77778 28.11111 26.77778 20.66667 22.44444
## [8] 32.88889 29.00000 26.77778 27.22222 25.00000 26.88889 26.88889
## [15] 27.66667 18.88889 26.33333 26.66667 21.66667 28.22222 22.00000
## [22] 26.88889 20.88889 28.22222 29.77778 21.88889 27.00000 32.00000
## [29] 22.00000 29.55556 30.22222 23.00000 25.00000 22.00000 31.00000
## [36] 32.22222 24.22222 26.00000 29.00000 27.55556 25.44444 26.77778
## [43] 31.77778 15.66667 24.55556 27.77778 18.66667 20.33333 27.44444
## [50] 29.11111 20.00000 22.55556 14.77778 18.55556 18.33333 16.55556
## [57] 17.55556 28.00000 24.55556 26.77778 24.55556 27.00000 31.66667
## [64] 24.33333 21.66667 26.55556 29.11111 16.66667 29.00000 28.33333
## [71] 19.00000 26.77778 29.55556 26.55556 19.22222 23.88889 30.66667
## [78] 17.66667 23.22222 23.66667 20.44444 29.22222 24.00000 20.44444
## [85] 20.77778 32.33333 32.00000 17.77778 29.88889 31.33333 22.11111
## [92] 24.33333 22.33333 30.77778 28.44444 20.11111 21.66667 28.00000
## [99] 26.66667 22.33333 22.66667 22.88889 19.22222 26.77778 25.44444
## [106] 32.22222 30.22222 21.44444 23.77778 21.55556 17.77778 20.22222
## [113] 17.33333 20.88889 21.33333 21.77778 27.44444 17.66667 23.88889
## [120] 23.33333 17.00000 30.00000 27.55556 22.88889 34.22222 23.77778
## [127] 28.55556 23.88889 24.66667 27.55556 19.66667 23.11111 23.44444
## [134] 18.66667 25.88889 31.33333 21.55556 30.66667 29.11111 17.11111
## [141] 20.22222 31.66667 25.00000 27.77778 16.11111 20.00000 24.11111
## [148] 26.66667 26.11111 21.11111 23.00000 31.00000 17.66667 21.44444
## [155] 16.33333 23.55556 21.00000 23.55556 15.77778 20.88889 29.88889
## [162] 28.88889 27.77778 20.44444 27.55556 16.55556 22.00000 27.44444
## [169] 18.55556 20.55556 25.66667 21.00000 20.66667 17.33333 27.88889
## [176] 17.66667 28.88889 13.00000 22.44444 19.11111 25.22222 29.11111
## [183] 25.11111 20.11111 34.44444 25.44444 22.55556 24.11111 20.22222
## [190] 25.00000 18.66667 23.00000 29.77778 25.55556 30.00000 19.33333
## [197] 21.33333 22.22222 26.11111 27.55556 14.55556 29.88889 25.77778
## [204] 21.55556 30.11111 23.44444 19.00000 18.77778 24.00000 17.55556
## [211] 23.77778 22.55556 15.77778 21.66667 28.11111 32.22222 31.77778
## [218] 21.00000 21.77778 19.88889 20.88889 19.00000 30.55556 22.11111
## [225] 24.44444 25.22222 18.11111 16.88889 17.77778 24.11111 23.77778
## [232] 10.55556 33.33333 26.88889 24.33333 23.00000 26.55556 30.33333
## [239] 21.77778 28.33333 15.44444 25.66667 14.44444 21.55556 26.88889
## [246] 19.66667 28.88889 21.33333 25.77778 21.66667 25.22222 31.33333
## [253] 29.55556 21.11111 27.66667 22.88889 18.44444 29.55556 22.77778
## [260] 19.66667 21.66667 25.66667 26.00000 22.88889 20.88889 31.66667
## [267] 33.66667 28.22222 21.66667 23.22222 28.77778 30.00000 21.77778
## [274] 28.00000 24.55556 30.22222 28.66667 22.66667 19.22222 15.55556
## [281] 25.22222 21.66667 30.11111 24.11111 21.77778 29.66667 20.88889
## [288] 19.88889 27.77778 29.44444 28.00000 25.77778 25.11111 24.66667
## [295] 24.33333 24.44444 22.66667 29.88889 18.44444 25.00000 27.00000
## [302] 21.44444 23.33333 26.55556 21.77778 32.33333 26.88889 23.11111
## [309] 26.22222 20.33333 25.44444 25.22222 13.88889 25.44444 27.00000
## [316] 22.11111 31.88889 29.44444 22.22222 26.88889 33.88889 23.11111
## [323] 18.11111 27.11111 22.77778 26.11111 22.77778 22.66667 24.44444
## [330] 23.77778 19.88889 26.88889 21.44444 20.55556 25.44444 23.66667
## [337] 29.22222 17.55556 26.77778 18.55556 24.88889 19.88889 26.22222
## [344] 21.88889 19.44444 22.22222 25.22222 20.11111 28.66667 26.77778
## [351] 18.44444 21.22222 22.55556 27.22222 27.33333 19.44444 25.88889
## [358] 31.00000 25.44444 25.00000 22.88889 27.66667 27.44444 23.22222
## [365] 25.77778 22.88889 27.00000 24.77778 20.22222 20.77778 20.88889
## [372] 21.11111 19.88889 21.11111 22.00000 24.66667 21.33333 24.33333
## [379] 25.77778 24.88889 25.88889 30.44444 20.33333 24.44444 26.33333
## [386] 20.00000 19.44444 19.88889 31.22222 19.22222 25.11111 29.11111
## [393] 17.44444 25.22222 26.33333 19.33333 24.00000 27.11111 22.88889
## [400] 25.44444 22.55556 21.44444 28.55556 22.88889 26.00000 17.77778
## [407] 24.33333 28.33333 19.55556 16.77778 21.22222 24.66667 22.88889
## [414] 25.33333 24.66667 22.66667 25.11111 25.44444 22.66667 28.44444
## [421] 20.44444 25.55556 28.22222 20.55556 24.33333 23.77778 22.55556
## [428] 25.22222 22.55556 22.33333 27.77778 19.11111 30.55556 18.55556
## [435] 26.00000 31.44444 15.77778 25.11111 24.00000 20.33333 25.11111
## [442] 18.55556 23.55556 19.88889 19.11111 26.88889 24.66667 20.44444
## [449] 24.22222 33.77778 25.00000 25.77778 19.77778 24.55556 22.88889
## [456] 23.77778 30.11111 18.77778 26.55556 31.33333 29.00000 21.77778
## [463] 29.88889 25.33333 22.66667 25.00000 24.88889 29.22222 26.22222
## [470] 23.77778 24.66667 20.88889 22.77778 25.11111 22.44444 23.44444
## [477] 24.55556 22.22222 27.66667 23.22222 23.44444 27.00000 24.11111
## [484] 24.00000 30.44444 25.77778 23.00000 25.66667 21.00000 25.33333
## [491] 21.77778 24.33333 28.66667 25.66667 29.22222 18.55556 19.44444
## [498] 33.55556 28.44444 26.66667 20.22222 16.55556 23.88889 22.55556
## [505] 27.88889
# Compute the mean of the scores for each course individually
colMeans(talent_scores)
## english reading creativity
## 86.112871 33.732673 9.845545
## mechanical abstract_reasoning math
## 11.146535 9.542574 24.348515
## social physical_science office_inventory
## 7.176238 16.324752 19.263366
# Compute the score each student has gained for all his courses
rowSums(talent_scores)
## [1] 224 165 196 253 241 186 202 296 261 241 245 225 242 242 249 170 237
## [18] 240 195 254 198 242 188 254 268 197 243 288 198 266 272 207 225 198
## [35] 279 290 218 234 261 248 229 241 286 141 221 250 168 183 247 262 180
## [52] 203 133 167 165 149 158 252 221 241 221 243 285 219 195 239 262 150
## [69] 261 255 171 241 266 239 173 215 276 159 209 213 184 263 216 184 187
## [86] 291 288 160 269 282 199 219 201 277 256 181 195 252 240 201 204 206
## [103] 173 241 229 290 272 193 214 194 160 182 156 188 192 196 247 159 215
## [120] 210 153 270 248 206 308 214 257 215 222 248 177 208 211 168 233 282
## [137] 194 276 262 154 182 285 225 250 145 180 217 240 235 190 207 279 159
## [154] 193 147 212 189 212 142 188 269 260 250 184 248 149 198 247 167 185
## [171] 231 189 186 156 251 159 260 117 202 172 227 262 226 181 310 229 203
## [188] 217 182 225 168 207 268 230 270 174 192 200 235 248 131 269 232 194
## [205] 271 211 171 169 216 158 214 203 142 195 253 290 286 189 196 179 188
## [222] 171 275 199 220 227 163 152 160 217 214 95 300 242 219 207 239 273
## [239] 196 255 139 231 130 194 242 177 260 192 232 195 227 282 266 190 249
## [256] 206 166 266 205 177 195 231 234 206 188 285 303 254 195 209 259 270
## [273] 196 252 221 272 258 204 173 140 227 195 271 217 196 267 188 179 250
## [290] 265 252 232 226 222 219 220 204 269 166 225 243 193 210 239 196 291
## [307] 242 208 236 183 229 227 125 229 243 199 287 265 200 242 305 208 163
## [324] 244 205 235 205 204 220 214 179 242 193 185 229 213 263 158 241 167
## [341] 224 179 236 197 175 200 227 181 258 241 166 191 203 245 246 175 233
## [358] 279 229 225 206 249 247 209 232 206 243 223 182 187 188 190 179 190
## [375] 198 222 192 219 232 224 233 274 183 220 237 180 175 179 281 173 226
## [392] 262 157 227 237 174 216 244 206 229 203 193 257 206 234 160 219 255
## [409] 176 151 191 222 206 228 222 204 226 229 204 256 184 230 254 185 219
## [426] 214 203 227 203 201 250 172 275 167 234 283 142 226 216 183 226 167
## [443] 212 179 172 242 222 184 218 304 225 232 178 221 206 214 271 169 239
## [460] 282 261 196 269 228 204 225 224 263 236 214 222 188 205 226 202 211
## [477] 221 200 249 209 211 243 217 216 274 232 207 231 189 228 196 219 258
## [494] 231 263 167 175 302 256 240 182 149 215 203 251
# Compute the total score that is gained by the students on each course
colSums(talent_scores)
## english reading creativity
## 43487 17035 4972
## mechanical abstract_reasoning math
## 5629 4819 12296
## social physical_science office_inventory
## 3624 8244 9728
# DO NOT HAVE THIS DATASET
scores_english <- talent_scores$english
# Compute the mean of the English scores
mean(scores_english, na.rm=TRUE)
## [1] 86.11287
# Compute the median of the English scores
median(scores_english, na.rm=TRUE)
## [1] 87
# Compute the variance between the English scores
var(scores_english, na.rm=TRUE)
## [1] 148.7591
# Compute the standard deviation between the English scores
sd(scores_english, na.rm=TRUE)
## [1] 12.19668
# Compute the minimum of the English scores
min(scores_english, na.rm=TRUE)
## [1] 0
# Compute the maximum of the English scores
max(scores_english, na.rm=TRUE)
## [1] 111
# Summary statistics for all variables - 5 digits
summary(talent_scores, digits=5)
## english reading creativity mechanical
## Min. : 0.000 Min. : 0.000 Min. : 0.0000 Min. : 0.000
## 1st Qu.: 80.000 1st Qu.:27.000 1st Qu.: 7.0000 1st Qu.: 8.000
## Median : 87.000 Median :36.000 Median :10.0000 Median :11.000
## Mean : 86.113 Mean :33.733 Mean : 9.8455 Mean :11.147
## 3rd Qu.: 94.000 3rd Qu.:41.000 3rd Qu.:13.0000 3rd Qu.:14.000
## Max. :111.000 Max. :48.000 Max. :19.0000 Max. :20.000
## abstract_reasoning math social physical_science
## Min. : 0.0000 Min. : 0.000 Min. : 0.0000 Min. : 0.000
## 1st Qu.: 8.0000 1st Qu.:17.000 1st Qu.: 5.0000 1st Qu.: 8.000
## Median :10.0000 Median :23.000 Median : 7.0000 Median :15.000
## Mean : 9.5426 Mean :24.349 Mean : 7.1762 Mean :16.325
## 3rd Qu.:11.0000 3rd Qu.:31.000 3rd Qu.:10.0000 3rd Qu.:23.000
## Max. :15.0000 Max. :52.000 Max. :12.0000 Max. :40.000
## office_inventory
## Min. : 0.000
## 1st Qu.:10.000
## Median :19.000
## Mean :19.263
## 3rd Qu.:29.000
## Max. :40.000
# Summary statistics for all variables - 10 digits
summary(talent_scores, digits=10)
## english reading creativity
## Min. : 0.00000 Min. : 0.00000 Min. : 0.000000
## 1st Qu.: 80.00000 1st Qu.:27.00000 1st Qu.: 7.000000
## Median : 87.00000 Median :36.00000 Median :10.000000
## Mean : 86.11287 Mean :33.73267 Mean : 9.845545
## 3rd Qu.: 94.00000 3rd Qu.:41.00000 3rd Qu.:13.000000
## Max. :111.00000 Max. :48.00000 Max. :19.000000
## mechanical abstract_reasoning math
## Min. : 0.000000 Min. : 0.000000 Min. : 0.00000
## 1st Qu.: 8.000000 1st Qu.: 8.000000 1st Qu.:17.00000
## Median :11.000000 Median :10.000000 Median :23.00000
## Mean :11.146535 Mean : 9.542574 Mean :24.34851
## 3rd Qu.:14.000000 3rd Qu.:11.000000 3rd Qu.:31.00000
## Max. :20.000000 Max. :15.000000 Max. :52.00000
## social physical_science office_inventory
## Min. : 0.000000 Min. : 0.00000 Min. : 0.00000
## 1st Qu.: 5.000000 1st Qu.: 8.00000 1st Qu.:10.00000
## Median : 7.000000 Median :15.00000 Median :19.00000
## Mean : 7.176238 Mean :16.32475 Mean :19.26337
## 3rd Qu.:10.000000 3rd Qu.:23.00000 3rd Qu.:29.00000
## Max. :12.000000 Max. :40.00000 Max. :40.00000
# Summary statistics for variables containing "cal". Calculate the statistics to 4 significant digits
summary(dplyr::select(talent_scores, dplyr::contains("cal")), digits=4)
## mechanical physical_science
## Min. : 0.00 Min. : 0.00
## 1st Qu.: 8.00 1st Qu.: 8.00
## Median :11.00 Median :15.00
## Mean :11.15 Mean :16.32
## 3rd Qu.:14.00 3rd Qu.:23.00
## Max. :20.00 Max. :40.00
# Summary statistics for variables containing "rea". Calculate the statistics to 4 significant digits
summary(dplyr::select(talent_scores, dplyr::contains("rea")), digits=4)
## reading creativity abstract_reasoning
## Min. : 0.00 Min. : 0.000 Min. : 0.000
## 1st Qu.:27.00 1st Qu.: 7.000 1st Qu.: 8.000
## Median :36.00 Median :10.000 Median :10.000
## Mean :33.73 Mean : 9.846 Mean : 9.543
## 3rd Qu.:41.00 3rd Qu.:13.000 3rd Qu.:11.000
## Max. :48.00 Max. :19.000 Max. :15.000
# RcmdrMisc::numSummary(talent_scores)
# Describe the `talent_scores` dataset
Hmisc::describe(talent_scores)
## talent_scores
##
## 9 Variables 505 Observations
## ---------------------------------------------------------------------------
## english
## n missing distinct Info Mean Gmd .05 .10
## 505 0 61 0.999 86.11 12.77 67 73
## .25 .50 .75 .90 .95
## 80 87 94 99 103
##
## lowest : 0 1 40 43 47, highest: 107 108 109 110 111
## ---------------------------------------------------------------------------
## reading
## n missing distinct Info Mean Gmd .05 .10
## 505 0 41 0.999 33.73 10.37 17 21
## .25 .50 .75 .90 .95
## 27 36 41 44 46
##
## lowest : 0 4 5 7 11, highest: 44 45 46 47 48
## ---------------------------------------------------------------------------
## creativity
## n missing distinct Info Mean Gmd .05 .10
## 505 0 20 0.994 9.846 4.417 4 5
## .25 .50 .75 .90 .95
## 7 10 13 15 16
##
## Value 0 1 2 3 4 5 6 7 8 9
## Frequency 1 5 8 10 20 26 25 47 51 54
## Proportion 0.002 0.010 0.016 0.020 0.040 0.051 0.050 0.093 0.101 0.107
##
## Value 10 11 12 13 14 15 16 17 18 19
## Frequency 46 46 27 44 27 26 22 7 10 3
## Proportion 0.091 0.091 0.053 0.087 0.053 0.051 0.044 0.014 0.020 0.006
## ---------------------------------------------------------------------------
## mechanical
## n missing distinct Info Mean Gmd .05 .10
## 505 0 21 0.995 11.15 4.788 4 6
## .25 .50 .75 .90 .95
## 8 11 14 17 18
##
## lowest : 0 1 2 3 4, highest: 16 17 18 19 20
## ---------------------------------------------------------------------------
## abstract_reasoning
## n missing distinct Info Mean Gmd .05 .10
## 505 0 16 0.986 9.543 3.08 4 6
## .25 .50 .75 .90 .95
## 8 10 11 13 14
##
## Value 0 1 2 3 4 5 6 7 8 9
## Frequency 2 2 6 7 12 13 22 43 48 73
## Proportion 0.004 0.004 0.012 0.014 0.024 0.026 0.044 0.085 0.095 0.145
##
## Value 10 11 12 13 14 15
## Frequency 75 76 64 33 21 8
## Proportion 0.149 0.150 0.127 0.065 0.042 0.016
## ---------------------------------------------------------------------------
## math
## n missing distinct Info Mean Gmd .05 .10
## 505 0 49 0.999 24.35 11.31 11 13
## .25 .50 .75 .90 .95
## 17 23 31 39 43
##
## lowest : 0 5 6 7 8, highest: 48 49 50 51 52
## ---------------------------------------------------------------------------
## social
## n missing distinct Info Mean Gmd .05 .10
## 505 0 13 0.989 7.176 3.287 2 3
## .25 .50 .75 .90 .95
## 5 7 10 11 11
##
## Value 0 1 2 3 4 5 6 7 8 9
## Frequency 3 13 23 30 32 45 42 69 56 55
## Proportion 0.006 0.026 0.046 0.059 0.063 0.089 0.083 0.137 0.111 0.109
##
## Value 10 11 12
## Frequency 78 44 15
## Proportion 0.154 0.087 0.030
## ---------------------------------------------------------------------------
## physical_science
## n missing distinct Info Mean Gmd .05 .10
## 505 0 41 0.999 16.32 10.95 3 4
## .25 .50 .75 .90 .95
## 8 15 23 31 34
##
## lowest : 0 1 2 3 4, highest: 36 37 38 39 40
## ---------------------------------------------------------------------------
## office_inventory
## n missing distinct Info Mean Gmd .05 .10
## 505 0 36 0.999 19.26 12.79 1.0 4.0
## .25 .50 .75 .90 .95
## 10.0 19.0 29.0 35.6 37.0
##
## lowest : 0 1 2 3 4, highest: 35 36 37 39 40
## ---------------------------------------------------------------------------
# Describe the `businesshours` dataset
Hmisc::describe(businesshours)
## businesshours
##
## 6 Variables 8 Observations
## ---------------------------------------------------------------------------
## QR1
## n missing distinct Info Mean Gmd
## 8 0 6 0.976 34.38 2.036
##
## Value 32 33 34 35 36 37
## Frequency 1 2 1 2 1 1
## Proportion 0.125 0.250 0.125 0.250 0.125 0.125
## ---------------------------------------------------------------------------
## QR2
## n missing distinct Info Mean Gmd
## 8 0 5 0.94 35.25 2.071
##
## Value 33 35 36 37 38
## Frequency 2 3 1 1 1
## Proportion 0.250 0.375 0.125 0.125 0.125
## ---------------------------------------------------------------------------
## QR3
## n missing distinct Info Mean Gmd
## 8 0 5 0.964 37.12 2.321
##
## Value 35 36 37 39 40
## Frequency 2 2 1 2 1
## Proportion 0.250 0.250 0.125 0.250 0.125
## ---------------------------------------------------------------------------
## QR4
## n missing distinct Info Mean Gmd
## 8 0 6 0.976 34.38 2.321
##
## Value 32 33 34 35 36 37
## Frequency 2 1 1 1 2 1
## Proportion 0.250 0.125 0.125 0.125 0.250 0.125
## ---------------------------------------------------------------------------
## country
## n missing distinct Info Mean Gmd
## 8 0 2 0.762 1.5 0.5714
##
## Value 1 2
## Frequency 4 4
## Proportion 0.5 0.5
## ---------------------------------------------------------------------------
## period
## n missing distinct
## 8 0 2
##
## Value ab bc
## Frequency 4 4
## Proportion 0.5 0.5
## ---------------------------------------------------------------------------
# DATASET "talent" has gender and fulltime
talent <- data.frame(gender = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2),
fulltime = c(4, 5, 5, 3, 1, 5, 1, 2, 4, 2, 1, 2, 5, 1, 5, 5, 4, 2, 3, 1, 4, 1, 4, 1, 1, 3, 4, 1, 1, 1, 2, 4, 3, 4, 1, 1, 3, 4, 1, 1, 5, 1, 1, 3, 3, 2, 5, 4, 3, 3, 5, 4, 5, 5, 1, 4, 4, 3, 4, 5, 5, 1, 1, 4, 4, 5, 1, 4, 1, 5, 5, 1, 2, 1, 4, 3, 1, 4, 4, 3, 5, 2, 1, 4, 5, 1, 1, 4, 1, 2, 3, 1, 4, 1, 2, 5, 3, 1, 5, 2, 4, 5, 5, 1, 5, 2, 1, 4, 4, 5, 2, 5, 2, 2, 5, 5, 1, 5, 1, 4, 5, 2, 1, 5, 1, 5, 3, 4, 4, 2, 5, 5, 1, 4, 3, 1, 2, 1, 1, 5, 4, 1, 1, 3, 5, 5, 3, 3, 2, 2, 2, 2, 3, 3, 5, 4, 1, 2, 4, 5, 2, 1, 1, 3, 1, 1, 5, 3, 5, 4, 1, 3, 4, 3, 3, 5, 1, 4, 4, 5, 1, 3, 1, 5, 1, 4, 3, 1, 1, 1, 5, 4, 2, 1, 1, 1, 3, 3, 1, 2, 5, 1, 3, 5, 1, 1, 5, 4, 4, 5, 3, 4, 5, 1, 2, 4, 2, 3, 2, 2, 1, 5, 1, 1, 2, 2, 4, 5, 5, 2, 3, 1, 1, 5, 3, 5, 4, 1, 5, 5, 4, 1, 5, 5, 1, 2, 1, 5, 5, 5, 3, 2, 5, 5, 1, 1, 5, 1, 4, 1, 4, 5, 1, 4, 5, 1, 1, 5, 5, 5, 1, 1, 5, 5, 5, 5, 2, 5, 5, 5, 4, 1, 1, 5, 5, 5, 5, 5, 1, 4, 1, 1, 5, 5, 4, 5, 5, 2, 1, 1, 1, 5, 1, 4, 5, 1, 5, 5, 5, 4, 5, 1, 5, 1, 4, 5, 4, 2, 5, 1, 1, 5, 5, 5, 3, 4, 4, 4, 1, 2, 5, 1, 5, 4, 5, 5, 2, 5, 4, 4, 5, 2, 1, 1, 5, 5, 5, 5, 2, 1, 4, 1, 4, 5, 1, 1, 4, 1, 4, 4, 5, 5, 5, 5, 3, 5, 1, 1, 5, 5, 1, 1, 1, 4, 5, 1, 5, 4, 4, 1, 4, 2, 3, 2, 5, 5, 4, 4, 2, 5, 5, 2, 5, 2, 5, 4, 1, 1, 4, 4, 4, 5, 1, 5, 5, 1, 1, 4, 4, 3, 3, 4, 5, 5, 1, 1, 2, 1, 1, 1, 4, 4, 5, 1, 4, 1, 1, 1, 4, 2, 2, 1, 1, 4, 1, 2, 5, 3, 4, 5, 2, 5, 3, 4, 3, 1, 3, 5, 5, 4, 5, 1, 4, 1, 4, 2, 1, 1, 1, 1, 5, 4, 1, 5, 4, 1, 1, 1, 4, 4, 5, 5, 1, 5, 3, 3, 2, 5, 5, 5, 5, 5, 5, 4, 1, 3, 2, 5, 1, 1, 3, 2, 5, 1, 1, 5, 3, 4, 1, 1, 4, 5, 1, 5, 1),
stringsAsFactors = FALSE)
# Generate a two-way contingency table of gender and fulltime and print the output
gender_fulltime <- table(talent)
# Obtain the Pearson's Chi-Squared test, the number of observations and the number of factors for the table
summary(gender_fulltime)
## Number of cases in table: 505
## Number of factors: 2
## Test for independence of all factors:
## Chisq = 20.713, df = 4, p-value = 0.000361
# Obtain the Pearson's Chi-Squared test for the table
chisq.test(gender_fulltime)
##
## Pearson's Chi-squared test
##
## data: gender_fulltime
## X-squared = 20.713, df = 4, p-value = 0.000361
# Generate a two-way contingency table of gender and fulltime with proportions
prop.table(gender_fulltime)
## fulltime
## gender 1 2 3 4 5
## 1 0.13861386 0.06534653 0.06732673 0.08712871 0.10495050
## 2 0.16039604 0.04554455 0.03168317 0.10495050 0.19405941
# Add the margins to `gender_fulltime`
addmargins(gender_fulltime)
## fulltime
## gender 1 2 3 4 5 Sum
## 1 70 33 34 44 53 234
## 2 81 23 16 53 98 271
## Sum 151 56 50 97 151 505
# Load the gmodels package
# library("gmodels")
# Generate a cross table of gender and fulltime
gender_fulltime_2 <- gmodels::CrossTable(talent$fulltime, talent$gender)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | Chi-square contribution |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 505
##
##
## | talent$gender
## talent$fulltime | 1 | 2 | Row Total |
## ----------------|-----------|-----------|-----------|
## 1 | 70 | 81 | 151 |
## | 0.000 | 0.000 | |
## | 0.464 | 0.536 | 0.299 |
## | 0.299 | 0.299 | |
## | 0.139 | 0.160 | |
## ----------------|-----------|-----------|-----------|
## 2 | 33 | 23 | 56 |
## | 1.916 | 1.655 | |
## | 0.589 | 0.411 | 0.111 |
## | 0.141 | 0.085 | |
## | 0.065 | 0.046 | |
## ----------------|-----------|-----------|-----------|
## 3 | 34 | 16 | 50 |
## | 5.064 | 4.373 | |
## | 0.680 | 0.320 | 0.099 |
## | 0.145 | 0.059 | |
## | 0.067 | 0.032 | |
## ----------------|-----------|-----------|-----------|
## 4 | 44 | 53 | 97 |
## | 0.020 | 0.017 | |
## | 0.454 | 0.546 | 0.192 |
## | 0.188 | 0.196 | |
## | 0.087 | 0.105 | |
## ----------------|-----------|-----------|-----------|
## 5 | 53 | 98 | 151 |
## | 4.115 | 3.553 | |
## | 0.351 | 0.649 | 0.299 |
## | 0.226 | 0.362 | |
## | 0.105 | 0.194 | |
## ----------------|-----------|-----------|-----------|
## Column Total | 234 | 271 | 505 |
## | 0.463 | 0.537 | |
## ----------------|-----------|-----------|-----------|
##
##
gender_fulltime_2
## $t
## y
## x 1 2
## 1 70 81
## 2 33 23
## 3 34 16
## 4 44 53
## 5 53 98
##
## $prop.row
## y
## x 1 2
## 1 0.4635762 0.5364238
## 2 0.5892857 0.4107143
## 3 0.6800000 0.3200000
## 4 0.4536082 0.5463918
## 5 0.3509934 0.6490066
##
## $prop.col
## y
## x 1 2
## 1 0.29914530 0.29889299
## 2 0.14102564 0.08487085
## 3 0.14529915 0.05904059
## 4 0.18803419 0.19557196
## 5 0.22649573 0.36162362
##
## $prop.tbl
## y
## x 1 2
## 1 0.13861386 0.16039604
## 2 0.06534653 0.04554455
## 3 0.06732673 0.03168317
## 4 0.08712871 0.10495050
## 5 0.10495050 0.19405941
# Generate a crosstable for gender and fulltime in which only the results for the chi-square test are included, and the row proportions.
gmodels::CrossTable(talent$fulltime,talent$gender, prop.c = FALSE, prop.t = FALSE, chisq = TRUE, prop.chisq = FALSE)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Row Total |
## |-------------------------|
##
##
## Total Observations in Table: 505
##
##
## | talent$gender
## talent$fulltime | 1 | 2 | Row Total |
## ----------------|-----------|-----------|-----------|
## 1 | 70 | 81 | 151 |
## | 0.464 | 0.536 | 0.299 |
## ----------------|-----------|-----------|-----------|
## 2 | 33 | 23 | 56 |
## | 0.589 | 0.411 | 0.111 |
## ----------------|-----------|-----------|-----------|
## 3 | 34 | 16 | 50 |
## | 0.680 | 0.320 | 0.099 |
## ----------------|-----------|-----------|-----------|
## 4 | 44 | 53 | 97 |
## | 0.454 | 0.546 | 0.192 |
## ----------------|-----------|-----------|-----------|
## 5 | 53 | 98 | 151 |
## | 0.351 | 0.649 | 0.299 |
## ----------------|-----------|-----------|-----------|
## Column Total | 234 | 271 | 505 |
## ----------------|-----------|-----------|-----------|
##
##
## Statistics for All Table Factors
##
##
## Pearson's Chi-squared test
## ------------------------------------------------------------
## Chi^2 = 20.71298 d.f. = 4 p = 0.0003609747
##
##
##
# Generate a cross table of gender and fulltime in SAS format
gmodels::CrossTable(talent$fulltime,talent$gender, format = "SAS")
##
##
## Cell Contents
## |-------------------------|
## | N |
## | Chi-square contribution |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 505
##
##
## | talent$gender
## talent$fulltime | 1 | 2 | Row Total |
## ----------------|-----------|-----------|-----------|
## 1 | 70 | 81 | 151 |
## | 0.000 | 0.000 | |
## | 0.464 | 0.536 | 0.299 |
## | 0.299 | 0.299 | |
## | 0.139 | 0.160 | |
## ----------------|-----------|-----------|-----------|
## 2 | 33 | 23 | 56 |
## | 1.916 | 1.655 | |
## | 0.589 | 0.411 | 0.111 |
## | 0.141 | 0.085 | |
## | 0.065 | 0.046 | |
## ----------------|-----------|-----------|-----------|
## 3 | 34 | 16 | 50 |
## | 5.064 | 4.373 | |
## | 0.680 | 0.320 | 0.099 |
## | 0.145 | 0.059 | |
## | 0.067 | 0.032 | |
## ----------------|-----------|-----------|-----------|
## 4 | 44 | 53 | 97 |
## | 0.020 | 0.017 | |
## | 0.454 | 0.546 | 0.192 |
## | 0.188 | 0.196 | |
## | 0.087 | 0.105 | |
## ----------------|-----------|-----------|-----------|
## 5 | 53 | 98 | 151 |
## | 4.115 | 3.553 | |
## | 0.351 | 0.649 | 0.299 |
## | 0.226 | 0.362 | |
## | 0.105 | 0.194 | |
## ----------------|-----------|-----------|-----------|
## Column Total | 234 | 271 | 505 |
## | 0.463 | 0.537 | |
## ----------------|-----------|-----------|-----------|
##
##
# Generate a cross table of gender and fulltime in SPSS format
gmodels::CrossTable(talent$fulltime,talent$gender, format = "SPSS")
##
## Cell Contents
## |-------------------------|
## | Count |
## | Chi-square contribution |
## | Row Percent |
## | Column Percent |
## | Total Percent |
## |-------------------------|
##
## Total Observations in Table: 505
##
## | talent$gender
## talent$fulltime | 1 | 2 | Row Total |
## ----------------|-----------|-----------|-----------|
## 1 | 70 | 81 | 151 |
## | 0.000 | 0.000 | |
## | 46.358% | 53.642% | 29.901% |
## | 29.915% | 29.889% | |
## | 13.861% | 16.040% | |
## ----------------|-----------|-----------|-----------|
## 2 | 33 | 23 | 56 |
## | 1.916 | 1.655 | |
## | 58.929% | 41.071% | 11.089% |
## | 14.103% | 8.487% | |
## | 6.535% | 4.554% | |
## ----------------|-----------|-----------|-----------|
## 3 | 34 | 16 | 50 |
## | 5.064 | 4.373 | |
## | 68.000% | 32.000% | 9.901% |
## | 14.530% | 5.904% | |
## | 6.733% | 3.168% | |
## ----------------|-----------|-----------|-----------|
## 4 | 44 | 53 | 97 |
## | 0.020 | 0.017 | |
## | 45.361% | 54.639% | 19.208% |
## | 18.803% | 19.557% | |
## | 8.713% | 10.495% | |
## ----------------|-----------|-----------|-----------|
## 5 | 53 | 98 | 151 |
## | 4.115 | 3.553 | |
## | 35.099% | 64.901% | 29.901% |
## | 22.650% | 36.162% | |
## | 10.495% | 19.406% | |
## ----------------|-----------|-----------|-----------|
## Column Total | 234 | 271 | 505 |
## | 46.337% | 53.663% | |
## ----------------|-----------|-----------|-----------|
##
##
Chapter 18 - Correlation and Regression
Correlation and significance:
Modeling functions - different approach:
Get the output - different functions for obtaining more of the output:
Common regression models:
Example code (not run due to lack of dataset) includes:
oldTalent <- talent
talent <- talent_scores
talent$gender <- oldTalent$gender
talent$fulltime <- oldTalent$fulltime
# Read the variables names
names(talent)
## [1] "english" "reading" "creativity"
## [4] "mechanical" "abstract_reasoning" "math"
## [7] "social" "physical_science" "office_inventory"
## [10] "gender" "fulltime"
# Create a subset of the dataframe talent, talent_selected, containing reading, english and creativity (in that order).
talent_selected <- subset(talent, select = c("reading", "english", "creativity"))
# Compute the correlations among reading, english and creativity
cor(talent_selected)
## reading english creativity
## reading 1.0000000 0.5548396 0.5971977
## english 0.5548396 1.0000000 0.3932995
## creativity 0.5971977 0.3932995 1.0000000
# Compute the p-values for all pairwise comparisons
cor.test(talent_selected$english, talent_selected$reading, use="pairwise")
##
## Pearson's product-moment correlation
##
## data: talent_selected$english and talent_selected$reading
## t = 14.957, df = 503, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.4913733 0.6124447
## sample estimates:
## cor
## 0.5548396
cor.test(talent_selected$english, talent_selected$creativity, use="pairwise")
##
## Pearson's product-moment correlation
##
## data: talent_selected$english and talent_selected$creativity
## t = 9.594, df = 503, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.3169203 0.4646103
## sample estimates:
## cor
## 0.3932995
cor.test(talent_selected$reading, talent_selected$creativity, use="pairwise")
##
## Pearson's product-moment correlation
##
## data: talent_selected$reading and talent_selected$creativity
## t = 16.698, df = 503, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.5379758 0.6505534
## sample estimates:
## cor
## 0.5971977
# Create a subset of the dataframe businesshours, businesshours_selected, containing QR1, QR2, QR3 and QR4 (in that order)
businesshours_selected <- dplyr::select(businesshours, QR1, QR2, QR3, QR4)
# Compute the correlations among QR1, QR2, QR3 and QR4
cor(businesshours_selected)
## QR1 QR2 QR3 QR4
## QR1 1.0000000 0.9312321 0.8924135 0.9205190
## QR2 0.9312321 1.0000000 0.9464288 0.9009476
## QR3 0.8924135 0.9464288 1.0000000 0.7821311
## QR4 0.9205190 0.9009476 0.7821311 1.0000000
# Test the **significance** of the correlations among `QR1` and `QR2`
cor.test(businesshours_selected$QR1, businesshours_selected$QR2, use="pairwise")
##
## Pearson's product-moment correlation
##
## data: businesshours_selected$QR1 and businesshours_selected$QR2
## t = 6.2593, df = 6, p-value = 0.0007717
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.6590123 0.9877377
## sample estimates:
## cor
## 0.9312321
# model_erc: regress english on reading and creativity
model_erc <- lm(english ~ reading + creativity, data=talent)
# Compute the summary statistics for model
summary(model_erc)
##
## Call:
## lm(formula = english ~ reading + creativity, data = talent)
##
## Residuals:
## Min 1Q Median 3Q Max
## -93.441 -4.757 0.658 5.916 35.104
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 60.8956 1.7246 35.311 <2e-16 ***
## reading 0.6593 0.0611 10.790 <2e-16 ***
## creativity 0.3025 0.1448 2.089 0.0372 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.12 on 502 degrees of freedom
## Multiple R-squared: 0.3138, Adjusted R-squared: 0.3111
## F-statistic: 114.8 on 2 and 502 DF, p-value: < 2.2e-16
# Perform an analysis of variance on model
anova(model_erc)
## Analysis of Variance Table
##
## Response: english
## Df Sum Sq Mean Sq F value Pr(>F)
## reading 1 23081 23080.7 225.2142 < 2e-16 ***
## creativity 1 447 447.3 4.3642 0.03721 *
## Residuals 502 51447 102.5
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Produce diagnostic plots for model
plot(model_erc)
# Predict based on the fitted function model_erc
predict(model_erc)
## 1 2 3 4 5 6 7 8
## 89.32963 72.90225 81.77520 95.81384 90.29142 77.76541 85.37405 96.66720
## 9 10 11 12 13 14 15 16
## 91.60995 90.18299 92.21500 85.01731 91.44730 91.66416 91.85826 85.42826
## 17 18 19 20 21 22 23 24
## 91.25321 89.27542 83.50469 93.06835 85.01731 90.75659 79.44067 92.87426
## 25 26 27 28 29 30 31 32
## 94.08436 80.40246 88.86446 93.58774 81.11594 89.32963 95.34867 88.56194
## 33 34 35 36 37 38 39 40
## 84.76900 81.72099 94.68941 94.30122 90.07456 92.51752 92.46331 94.13857
## 41 42 43 44 45 46 47 48
## 91.96669 91.19899 95.04615 76.36122 84.76900 97.98573 85.42826 78.47889
## 49 50 51 52 53 54 55 56
## 90.84225 88.25941 79.38646 79.49489 78.83562 76.91205 79.85163 66.00709
## 57 58 59 60 61 62 63 64
## 82.73699 94.79784 89.57794 93.23100 92.02090 94.08436 95.15458 86.33584
## 65 66 67 68 69 70 71 72
## 86.33584 93.01414 87.35184 75.59352 93.23100 86.88667 81.77520 90.53973
## 73 74 75 76 77 78 79 80
## 90.84225 93.23100 79.65754 87.70858 93.89026 75.59352 84.71478 88.97289
## 81 82 83 84 85 86 87 88
## 81.77520 90.53973 80.81342 84.90888 79.13815 95.34867 93.83605 72.95647
## 89 90 91 92 93 94 95 96
## 96.41889 92.51752 81.61256 82.48868 80.04572 94.08436 94.44110 80.92185
## 97 98 99 100 101 102 103 104
## 91.60995 92.16078 82.98530 78.06793 82.32604 87.29762 83.45047 90.84225
## 105 106 107 108 109 110 111 112
## 89.46951 95.15458 93.47931 90.64816 85.51393 89.63216 94.44110 74.71740
## 113 114 115 116 117 118 119 120
## 71.63794 87.54593 76.50109 79.79741 80.75920 79.02972 90.53973 88.56194
## 121 122 123 124 125 126 127 128
## 75.53931 95.10036 88.97289 88.67037 95.34867 87.65436 91.91247 81.11594
## 129 130 131 132 133 134 135 136
## 86.94089 92.10657 80.75920 78.17636 84.52069 81.11594 89.68637 95.75962
## 137 138 139 140 141 142 143 144
## 83.45047 96.41889 94.19279 74.63173 82.07773 96.00793 84.46648 94.79784
## 145 146 147 148 149 150 151 152
## 77.81962 76.96626 86.69258 92.87426 89.46951 78.53310 87.95689 88.50772
## 153 154 155 156 157 158 159 160
## 73.61573 74.82583 70.01689 83.09373 86.08753 81.82942 73.25899 78.17636
## 161 162 163 164 165 166 167 168
## 91.44730 93.03691 89.93468 74.16656 87.35184 72.24299 83.89287 91.66416
## 169 170 171 172 173 174 175 176
## 83.45047 75.53931 82.68278 83.45047 82.79121 82.18616 92.10657 72.13455
## 177 178 179 180 181 182 183 184
## 95.45710 71.63794 88.11954 80.09994 84.05552 87.84846 87.90267 89.98890
## 185 186 187 188 189 190 191 192
## 97.02394 77.35445 86.69258 93.03691 89.98890 84.35804 77.51710 85.56814
## 193 194 195 196 197 198 199 200
## 92.51752 84.35804 94.13857 65.04530 91.85826 79.79741 85.62236 93.42509
## 201 202 203 204 205 206 207 208
## 73.06490 93.58774 87.65436 88.97289 92.26921 90.29142 80.26258 81.27859
## 209 210 211 212 213 214 215 216
## 91.14478 79.79741 88.56194 86.39005 76.80362 79.44067 94.68941 94.08436
## 217 218 219 220 221 222 223 224
## 93.89026 87.95689 86.69258 81.47268 86.33584 92.62595 90.59394 85.07152
## 225 226 227 228 229 230 231 232
## 87.24341 92.46331 82.43447 76.85783 72.65394 91.55573 78.47889 66.41804
## 233 234 235 236 237 238 239 240
## 96.06215 91.50152 88.42206 89.32963 89.52373 94.24700 74.71740 93.94448
## 241 242 243 244 245 246 247 248
## 69.96267 91.00490 91.00490 77.26879 90.29142 87.10353 93.17678 81.17016
## 249 250 251 252 253 254 255 256
## 86.88667 78.78141 92.98269 92.51752 90.34564 91.96669 91.19899 80.04572
## 257 258 259 260 261 262 263 264
## 73.01068 94.49531 81.00751 80.20837 78.23058 83.50469 89.93468 84.71478
## 265 266 267 268 269 270 271 272
## 82.89964 93.17678 93.78183 89.63216 78.64153 92.87426 93.72762 92.46331
## 273 274 275 276 277 278 279 280
## 81.17016 95.10036 83.39626 91.60995 91.91247 81.77520 73.75561 71.99468
## 281 282 283 284 285 286 287 288
## 85.31983 82.54290 93.83605 84.96309 75.95026 94.49531 84.41226 81.41847
## 289 290 291 292 293 294 295 296
## 87.10353 92.62595 93.23100 84.10974 92.92848 90.18299 91.30742 85.07152
## 297 298 299 300 301 302 303 304
## 80.56511 92.51752 73.31320 87.70858 91.30742 83.39626 91.91247 84.16395
## 305 306 307 308 309 310 311 312
## 80.04572 95.75962 88.61615 87.60015 87.95689 86.39005 87.87123 88.91868
## 313 314 315 316 317 318 319 320
## 72.35142 90.95068 94.08436 78.83562 95.70541 91.60995 83.45047 88.91868
## 321 322 323 324 325 326 327 328
## 97.62899 81.11594 80.45668 88.72458 88.36784 91.19899 82.02351 82.07773
## 329 330 331 332 333 334 335 336
## 89.98890 86.03331 86.49848 90.34564 82.79121 77.21457 91.60995 83.89287
## 337 338 339 340 341 342 343 344
## 98.28825 77.92805 90.48551 73.97247 78.83562 87.95689 94.49531 89.68637
## 345 346 347 348 349 350 351 352
## 72.95647 86.44427 92.62595 76.80362 93.64196 89.22120 81.52690 86.08753
## 353 354 355 356 357 358 359 360
## 78.83562 94.24700 97.32646 60.89562 84.00131 92.46331 91.50152 84.35804
## 361 362 363 364 365 366 367 368
## 80.45668 88.11954 88.67037 87.35184 87.40606 82.54290 94.44110 85.67657
## 369 370 371 372 373 374 375 376
## 80.40246 79.54910 75.95026 87.04932 88.81025 78.53310 77.76541 85.37405
## 377 378 379 380 381 382 383 384
## 83.45047 84.41226 88.36784 90.84225 89.52373 93.23100 79.90584 92.62595
## 385 386 387 388 389 390 391 392
## 88.25941 76.85783 87.65436 86.74679 95.10036 82.43447 82.43447 91.60995
## 393 394 395 396 397 398 399 400
## 74.98847 96.47310 91.60995 75.53931 86.03331 94.49531 80.20837 79.19236
## 401 402 403 404 405 406 407 408
## 87.70858 75.18257 96.06215 80.40246 89.63216 71.94046 88.36784 87.76280
## 409 410 411 412 413 414 415 416
## 77.57131 73.01068 79.85163 85.67657 81.88364 88.61615 82.73699 77.51710
## 417 418 419 420 421 422 423 424
## 90.53973 83.64457 92.62595 93.78183 79.30080 91.25321 92.10657 84.76900
## 425 426 427 428 429 430 431 432
## 83.39626 82.07773 79.96006 90.04311 79.13815 76.14435 93.17678 80.81342
## 433 434 435 436 437 438 439 440
## 93.72762 73.56151 85.56814 96.06215 70.73036 85.73079 91.25321 74.63173
## 441 442 443 444 445 446 447 448
## 90.89647 77.57131 87.95689 77.26879 82.13194 91.85826 86.63836 74.98847
## 449 450 451 452 453 454 455 456
## 89.22120 93.83605 88.91868 88.31363 83.94709 81.17016 87.40606 87.76280
## 457 458 459 460 461 462 463 464
## 92.57174 75.70195 93.23100 93.58774 94.38688 79.30080 95.86806 85.37405
## 465 466 467 468 469 470 471 472
## 79.08393 83.91564 90.23720 92.46331 88.56194 88.61615 90.04311 75.48509
## 473 474 475 476 477 478 479 480
## 88.15098 86.94089 83.34204 83.69878 96.06215 77.57131 95.10036 85.31983
## 481 482 483 484 485 486 487 488
## 89.93468 88.61615 88.36784 83.28783 94.74362 91.19899 90.23720 82.48868
## 489 490 491 492 493 494 495 496
## 80.15415 93.53352 84.90888 83.50469 91.19899 89.38385 94.08436 78.99827
## 497 498 499 500 501 502 503 504
## 77.26879 95.10036 97.07815 91.25321 76.50109 76.85783 91.25321 76.91205
## 505
## 94.79784
# Regress english against creativity and reading - no interaction term
model_1 <- lm(english ~ creativity + reading, data=talent)
# Summary statistics for model_1
summary(model_1)
##
## Call:
## lm(formula = english ~ creativity + reading, data = talent)
##
## Residuals:
## Min 1Q Median 3Q Max
## -93.441 -4.757 0.658 5.916 35.104
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 60.8956 1.7246 35.311 <2e-16 ***
## creativity 0.3025 0.1448 2.089 0.0372 *
## reading 0.6593 0.0611 10.790 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.12 on 502 degrees of freedom
## Multiple R-squared: 0.3138, Adjusted R-squared: 0.3111
## F-statistic: 114.8 on 2 and 502 DF, p-value: < 2.2e-16
# Regress english against creativity and reading - interaction term
model_2 <- lm(english ~ creativity * reading, data=talent)
# Summary statistics for model_2
summary(model_2)
##
## Call:
## lm(formula = english ~ creativity * reading, data = talent)
##
## Residuals:
## Min 1Q Median 3Q Max
## -94.065 -4.672 0.628 5.853 30.488
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 65.51193 3.95335 16.571 < 2e-16 ***
## creativity -0.27554 0.46845 -0.588 0.557
## reading 0.51868 0.12437 4.170 3.58e-05 ***
## creativity:reading 0.01646 0.01269 1.297 0.195
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.12 on 501 degrees of freedom
## Multiple R-squared: 0.3161, Adjusted R-squared: 0.312
## F-statistic: 77.19 on 3 and 501 DF, p-value: < 2.2e-16
# Plot the relation between math and reading
plot(talent$math, talent$reading)
# Regress reading against math - no higher order terms
model_1 <- lm(reading ~ poly(math, 1), data=talent)
# Summary statistics for model_1
summary(model_1)
##
## Call:
## lm(formula = reading ~ poly(math, 1), data = talent)
##
## Residuals:
## Min 1Q Median 3Q Max
## -40.274 -4.361 0.024 4.849 23.937
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 33.7327 0.3233 104.33 <2e-16 ***
## poly(math, 1) 126.9301 7.2660 17.47 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.266 on 503 degrees of freedom
## Multiple R-squared: 0.3776, Adjusted R-squared: 0.3764
## F-statistic: 305.2 on 1 and 503 DF, p-value: < 2.2e-16
# Regress reading against math - 1 higher order term
model_2 <- lm(reading ~ poly(math, 2), data=talent)
# Summary statistics for model_2
summary(model_2)
##
## Call:
## lm(formula = reading ~ poly(math, 2), data = talent)
##
## Residuals:
## Min 1Q Median 3Q Max
## -40.406 -4.554 0.229 4.714 26.745
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 33.7327 0.3227 104.517 <2e-16 ***
## poly(math, 2)1 126.9301 7.2528 17.501 <2e-16 ***
## poly(math, 2)2 -12.1921 7.2528 -1.681 0.0934 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.253 on 502 degrees of freedom
## Multiple R-squared: 0.3811, Adjusted R-squared: 0.3786
## F-statistic: 154.6 on 2 and 502 DF, p-value: < 2.2e-16
# Regress reading against math - 2 higher order terms
model_3 <- lm(reading ~ poly(math, 3), data=talent)
# Summary statistics for model_3
summary(model_3)
##
## Call:
## lm(formula = reading ~ poly(math, 3), data = talent)
##
## Residuals:
## Min 1Q Median 3Q Max
## -41.541 -4.420 0.370 4.609 19.250
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 33.7327 0.3203 105.311 < 2e-16 ***
## poly(math, 3)1 126.9301 7.1982 17.634 < 2e-16 ***
## poly(math, 3)2 -12.1921 7.1982 -1.694 0.09093 .
## poly(math, 3)3 -21.1712 7.1982 -2.941 0.00342 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.198 on 501 degrees of freedom
## Multiple R-squared: 0.3916, Adjusted R-squared: 0.388
## F-statistic: 107.5 on 3 and 501 DF, p-value: < 2.2e-16
Chapter 19 - Comparing Groups
Test independency between groups:
Test matched groups:
Analysis of variation (ANOVA):
Post-hoc with t-tests:
ANOVA and ANCOVA - suppose that a and b are factors while x is a continuous variable:
Example code (not run due to lack of dataset) includes:
# Ensure that the data are normally distributed
hist(talent$english)
# Perform the parametric test
t.test(talent$english ~ talent$gender)
##
## Welch Two Sample t-test
##
## data: talent$english by talent$gender
## t = -5.8389, df = 469.96, p-value = 9.815e-09
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -8.295758 -4.118000
## sample estimates:
## mean in group 1 mean in group 2
## 82.78205 88.98893
# Perform the parametric test
# t.test(mydata$q1, mydata$q2, paired=TRUE)
# Perform the Wilcoxon signed rank test
# wilcox.test(mydata$q1, mydata$q2, paired=TRUE)
# Compute the means for subsets of the variable english
talent$region <- c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 5, 2, 2, 2, 5, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, 2, 5, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 5, 5, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 3, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 5, 7, 7, 7)
talent$region <- factor(talent$region)
talent$fulltime <- factor(talent$fulltime)
by(talent$english, talent$region, mean, na.rm=TRUE)
## talent$region: 1
## [1] 85.0625
## --------------------------------------------------------
## talent$region: 2
## [1] 86.68421
## --------------------------------------------------------
## talent$region: 3
## [1] 85.75
## --------------------------------------------------------
## talent$region: 4
## [1] 84.95556
## --------------------------------------------------------
## talent$region: 5
## [1] 86.13592
## --------------------------------------------------------
## talent$region: 6
## [1] 88.13514
## --------------------------------------------------------
## talent$region: 7
## [1] 85.94444
## --------------------------------------------------------
## talent$region: 8
## [1] 86.34146
## --------------------------------------------------------
## talent$region: 9
## [1] 84
# Compute the variances for subsets of the variable english
anova_2 <- by(talent$english, talent$region, var, na.rm=TRUE)
anova_2
## talent$region: 1
## [1] 315.2863
## --------------------------------------------------------
## talent$region: 2
## [1] 125.3035
## --------------------------------------------------------
## talent$region: 3
## [1] 175.792
## --------------------------------------------------------
## talent$region: 4
## [1] 107.9071
## --------------------------------------------------------
## talent$region: 5
## [1] 115.0598
## --------------------------------------------------------
## talent$region: 6
## [1] 151.1201
## --------------------------------------------------------
## talent$region: 7
## [1] 141.3497
## --------------------------------------------------------
## talent$region: 8
## [1] 136.7805
## --------------------------------------------------------
## talent$region: 9
## [1] 288
# Test for homogeneity of variance across the groups
car::leveneTest(talent$english, talent$region)
## Levene's Test for Homogeneity of Variance (center = median)
## Df F value Pr(>F)
## group 8 0.2838 0.9712
## 496
# Create the ANOVA model: model_anova
model_anova <- aov(talent$reading ~ talent$fulltime) # aov(talent$reading ~ talent$fulltime, data = talent)
# Print the summary statistics for the ANOVA model
summary(model_anova)
## Df Sum Sq Mean Sq F value Pr(>F)
## talent$fulltime 4 6769 1692.4 23.57 <2e-16 ***
## Residuals 500 35897 71.8
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(model_anova)
## Analysis of Variance Table
##
## Response: talent$reading
## Df Sum Sq Mean Sq F value Pr(>F)
## talent$fulltime 4 6769 1692.36 23.572 < 2.2e-16 ***
## Residuals 500 35897 71.79
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Obtain diagnostic plots for the ANOVA model
plot(model_anova)
# Perform the Kruskal-Wallis test
kt <- kruskal.test(talent$english ~ talent$region)
kt
##
## Kruskal-Wallis rank sum test
##
## data: talent$english by talent$region
## Kruskal-Wallis chi-squared = 3.4517, df = 8, p-value = 0.9029
# Perform pairwise comparisons between the fulltime levels for english: pairwise t-test
pairwise.t.test(talent$english, talent$fulltime)
##
## Pairwise comparisons using t tests with pooled SD
##
## data: talent$english and talent$fulltime
##
## 1 2 3 4
## 2 1.0000 - - -
## 3 0.0020 0.0790 - -
## 4 0.0010 0.1057 1.0000 -
## 5 4.9e-08 0.0023 1.0000 0.6824
##
## P value adjustment method: holm
# Regress english against fulltime using the analysis of variance approach and assign the name model_t
model_t <- aov(talent$english ~ talent$fulltime)
# Perform pairwise comparisons between the fulltime levels for english: tukeyHSD test
# Print the result
THSD <- TukeyHSD(model_t, "talent$fulltime")
THSD
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = talent$english ~ talent$fulltime)
##
## $`talent$fulltime`
## diff lwr upr p adj
## 2-1 -1.3915563 -6.421534 3.6384214 0.9425096
## 3-1 -7.0765563 -12.322069 -1.8310438 0.0022762
## 4-1 -5.9495460 -10.132806 -1.7662858 0.0010580
## 5-1 -8.0463576 -11.746257 -4.3464584 0.0000000
## 3-2 -5.6850000 -11.940144 0.5701437 0.0949091
## 4-2 -4.5579897 -9.953459 0.8374794 0.1424661
## 5-2 -6.6548013 -11.684779 -1.6248236 0.0029616
## 4-3 1.1270103 -4.469936 6.7239570 0.9817595
## 5-3 -0.9698013 -6.215314 4.2757112 0.9867490
## 5-4 -2.0968116 -6.280072 2.0864485 0.6457656
# Plot the result of the tukeyHSD test
plot(THSD)
# Perform the Pairwise Wilcoxon Rank Sum Test
pairwise.wilcox.test(talent$english, talent$fulltime)
##
## Pairwise comparisons using Wilcoxon rank sum test
##
## data: talent$english and talent$fulltime
##
## 1 2 3 4
## 2 1.00000 - - -
## 3 7.1e-06 0.00047 - -
## 4 7.1e-06 0.00248 1.00000 -
## 5 4.0e-09 0.00011 1.00000 1.00000
##
## P value adjustment method: holm
Chapter 20 - High Quality Output
High-Quality Output:
Example code includes:
# Load the required package xtable
library(xtable)
# The linear model myM1 is created.
myM1 <- lm(q4 ~ q1 + q2 + q3, data = mydata)
# Print an xtable of the linear model 'myM1' and print it as a LaTeX table.
print(xtable(myM1), type="LaTeX")
# Make sure to load the required package
library(texreg)
# Create the table you see on the right from the linear model `myM1` and call the file "myM1.doc".
htmlreg(myM1, single.row=TRUE, file="myM1.doc")
library(texreg)
# Two linear models myM1 and myM2 are created.
myM1 <- lm(q4 ~ q1 + q2 + q3, data = mydata)
myM2 <- lm(q4 ~ q1, data = mydata)
# Create a HTML table of the linear models 'myM1` and `myM2`.
htmlreg(list(myM1, myM2))
# Create a LaTeX table of the linear models 'myM1` and `myM2`.
texreg(list(myM1, myM2))
Chapter 21 - Ways to Run R
Chapter 1 - Data Wrangling
Gapminder Dataset - tracks socioeconomic indicators for countries over time:
Filter verb (dplyr) - looking at a subset of observations:
Arrange verb (dplyr) - sorts the observations in ascending or descending order:
Mutate verb (dplyr) - changing ot adding variables:
Example code includes:
# Load the gapminder package
library(gapminder)
# Load the dplyr package
library(dplyr)
# Look at the gapminder dataset
gapminder
## # A tibble: 1,704 x 6
## country continent year lifeExp pop gdpPercap
## <fctr> <fctr> <int> <dbl> <int> <dbl>
## 1 Afghanistan Asia 1952 28.801 8425333 779.4453
## 2 Afghanistan Asia 1957 30.332 9240934 820.8530
## 3 Afghanistan Asia 1962 31.997 10267083 853.1007
## 4 Afghanistan Asia 1967 34.020 11537966 836.1971
## 5 Afghanistan Asia 1972 36.088 13079460 739.9811
## 6 Afghanistan Asia 1977 38.438 14880372 786.1134
## 7 Afghanistan Asia 1982 39.854 12881816 978.0114
## 8 Afghanistan Asia 1987 40.822 13867957 852.3959
## 9 Afghanistan Asia 1992 41.674 16317921 649.3414
## 10 Afghanistan Asia 1997 41.763 22227415 635.3414
## # ... with 1,694 more rows
# Filter the gapminder dataset for the year 1957
gapminder %>% filter(year == 1957)
## # A tibble: 142 x 6
## country continent year lifeExp pop gdpPercap
## <fctr> <fctr> <int> <dbl> <int> <dbl>
## 1 Afghanistan Asia 1957 30.332 9240934 820.8530
## 2 Albania Europe 1957 59.280 1476505 1942.2842
## 3 Algeria Africa 1957 45.685 10270856 3013.9760
## 4 Angola Africa 1957 31.999 4561361 3827.9405
## 5 Argentina Americas 1957 64.399 19610538 6856.8562
## 6 Australia Oceania 1957 70.330 9712569 10949.6496
## 7 Austria Europe 1957 67.480 6965860 8842.5980
## 8 Bahrain Asia 1957 53.832 138655 11635.7995
## 9 Bangladesh Asia 1957 39.348 51365468 661.6375
## 10 Belgium Europe 1957 69.240 8989111 9714.9606
## # ... with 132 more rows
# Filter for China in 2002
gapminder %>% filter(year == 2002, country == "China")
## # A tibble: 1 x 6
## country continent year lifeExp pop gdpPercap
## <fctr> <fctr> <int> <dbl> <int> <dbl>
## 1 China Asia 2002 72.028 1280400000 3119.281
# Sort in ascending order of lifeExp
gapminder %>% arrange(lifeExp)
## # A tibble: 1,704 x 6
## country continent year lifeExp pop gdpPercap
## <fctr> <fctr> <int> <dbl> <int> <dbl>
## 1 Rwanda Africa 1992 23.599 7290203 737.0686
## 2 Afghanistan Asia 1952 28.801 8425333 779.4453
## 3 Gambia Africa 1952 30.000 284320 485.2307
## 4 Angola Africa 1952 30.015 4232095 3520.6103
## 5 Sierra Leone Africa 1952 30.331 2143249 879.7877
## 6 Afghanistan Asia 1957 30.332 9240934 820.8530
## 7 Cambodia Asia 1977 31.220 6978607 524.9722
## 8 Mozambique Africa 1952 31.286 6446316 468.5260
## 9 Sierra Leone Africa 1957 31.570 2295678 1004.4844
## 10 Burkina Faso Africa 1952 31.975 4469979 543.2552
## # ... with 1,694 more rows
# Sort in descending order of lifeExp
gapminder %>% arrange(desc(lifeExp))
## # A tibble: 1,704 x 6
## country continent year lifeExp pop gdpPercap
## <fctr> <fctr> <int> <dbl> <int> <dbl>
## 1 Japan Asia 2007 82.603 127467972 31656.07
## 2 Hong Kong, China Asia 2007 82.208 6980412 39724.98
## 3 Japan Asia 2002 82.000 127065841 28604.59
## 4 Iceland Europe 2007 81.757 301931 36180.79
## 5 Switzerland Europe 2007 81.701 7554661 37506.42
## 6 Hong Kong, China Asia 2002 81.495 6762476 30209.02
## 7 Australia Oceania 2007 81.235 20434176 34435.37
## 8 Spain Europe 2007 80.941 40448191 28821.06
## 9 Sweden Europe 2007 80.884 9031088 33859.75
## 10 Israel Asia 2007 80.745 6426679 25523.28
## # ... with 1,694 more rows
# Filter for the year 1957, then arrange in descending order of population
gapminder %>% filter(year == 1957) %>% arrange(desc(pop))
## # A tibble: 142 x 6
## country continent year lifeExp pop gdpPercap
## <fctr> <fctr> <int> <dbl> <int> <dbl>
## 1 China Asia 1957 50.54896 637408000 575.9870
## 2 India Asia 1957 40.24900 409000000 590.0620
## 3 United States Americas 1957 69.49000 171984000 14847.1271
## 4 Japan Asia 1957 65.50000 91563009 4317.6944
## 5 Indonesia Asia 1957 39.91800 90124000 858.9003
## 6 Germany Europe 1957 69.10000 71019069 10187.8267
## 7 Brazil Americas 1957 53.28500 65551171 2487.3660
## 8 United Kingdom Europe 1957 70.42000 51430000 11283.1779
## 9 Bangladesh Asia 1957 39.34800 51365468 661.6375
## 10 Italy Europe 1957 67.81000 49182000 6248.6562
## # ... with 132 more rows
# Use mutate to change lifeExp to be in months
gapminder %>% mutate(lifeExp = lifeExp * 12)
## # A tibble: 1,704 x 6
## country continent year lifeExp pop gdpPercap
## <fctr> <fctr> <int> <dbl> <int> <dbl>
## 1 Afghanistan Asia 1952 345.612 8425333 779.4453
## 2 Afghanistan Asia 1957 363.984 9240934 820.8530
## 3 Afghanistan Asia 1962 383.964 10267083 853.1007
## 4 Afghanistan Asia 1967 408.240 11537966 836.1971
## 5 Afghanistan Asia 1972 433.056 13079460 739.9811
## 6 Afghanistan Asia 1977 461.256 14880372 786.1134
## 7 Afghanistan Asia 1982 478.248 12881816 978.0114
## 8 Afghanistan Asia 1987 489.864 13867957 852.3959
## 9 Afghanistan Asia 1992 500.088 16317921 649.3414
## 10 Afghanistan Asia 1997 501.156 22227415 635.3414
## # ... with 1,694 more rows
# Use mutate to create a new column called lifeExpMonths
gapminder %>% mutate(lifeExpMonths = lifeExp * 12)
## # A tibble: 1,704 x 7
## country continent year lifeExp pop gdpPercap lifeExpMonths
## <fctr> <fctr> <int> <dbl> <int> <dbl> <dbl>
## 1 Afghanistan Asia 1952 28.801 8425333 779.4453 345.612
## 2 Afghanistan Asia 1957 30.332 9240934 820.8530 363.984
## 3 Afghanistan Asia 1962 31.997 10267083 853.1007 383.964
## 4 Afghanistan Asia 1967 34.020 11537966 836.1971 408.240
## 5 Afghanistan Asia 1972 36.088 13079460 739.9811 433.056
## 6 Afghanistan Asia 1977 38.438 14880372 786.1134 461.256
## 7 Afghanistan Asia 1982 39.854 12881816 978.0114 478.248
## 8 Afghanistan Asia 1987 40.822 13867957 852.3959 489.864
## 9 Afghanistan Asia 1992 41.674 16317921 649.3414 500.088
## 10 Afghanistan Asia 1997 41.763 22227415 635.3414 501.156
## # ... with 1,694 more rows
# Filter, mutate, and arrange the gapminder dataset
gapminder %>%
filter(year == 2007) %>%
mutate(lifeExpMonths = 12 * lifeExp) %>%
arrange(desc(lifeExpMonths))
## # A tibble: 142 x 7
## country continent year lifeExp pop gdpPercap
## <fctr> <fctr> <int> <dbl> <int> <dbl>
## 1 Japan Asia 2007 82.603 127467972 31656.07
## 2 Hong Kong, China Asia 2007 82.208 6980412 39724.98
## 3 Iceland Europe 2007 81.757 301931 36180.79
## 4 Switzerland Europe 2007 81.701 7554661 37506.42
## 5 Australia Oceania 2007 81.235 20434176 34435.37
## 6 Spain Europe 2007 80.941 40448191 28821.06
## 7 Sweden Europe 2007 80.884 9031088 33859.75
## 8 Israel Asia 2007 80.745 6426679 25523.28
## 9 France Europe 2007 80.657 61083916 30470.02
## 10 Canada Americas 2007 80.653 33390141 36319.24
## # ... with 132 more rows, and 1 more variables: lifeExpMonths <dbl>
Chapter 2 - Data Visualization
Visualizing with ggplot2 - mainly, subsets of the gapminder data, such as just the 2007 data:
Log scales - managing the “several orders of magnitude” issue in many datasets:
Additional aesthetics - such as looking at continent as the colors or life expectancy as the size:
Faceting - creating sub-plots based on a categorical/factor variable:
Example code includes:
# Load the ggplot2 package as well
library(ggplot2)
library(gapminder)
library(dplyr)
# Create gapminder_1952
gapminder_1952 <- gapminder %>% filter(year == 1952)
# Change to put pop on the x-axis and gdpPercap on the y-axis
ggplot(gapminder_1952, aes(x = pop, y = gdpPercap)) +
geom_point()
# Create a scatter plot with pop on the x-axis and lifeExp on the y-axis
ggplot(gapminder_1952, aes(x=pop, y=lifeExp)) + geom_point()
# Change this plot to put the x-axis on a log scale
ggplot(gapminder_1952, aes(x = pop, y = lifeExp)) +
geom_point() + scale_x_log10()
# Scatter plot comparing pop and gdpPerCap, with both axes on a log scale
ggplot(gapminder_1952, aes(x=pop, y=gdpPercap)) + geom_point() + scale_x_log10() + scale_y_log10()
# Scatter plot comparing pop and lifeExp, with color representing continent
ggplot(gapminder_1952, aes(x=pop, y=lifeExp, color=continent)) + geom_point() + scale_x_log10()
# Add the size aesthetic to represent a country's gdpPercap
ggplot(gapminder_1952, aes(x = pop, y = lifeExp, color = continent, size=gdpPercap)) +
geom_point() +
scale_x_log10()
# Scatter plot comparing pop and lifeExp, faceted by continent
ggplot(gapminder_1952, aes(x=pop, y=lifeExp)) + geom_point() + scale_x_log10() + facet_wrap(~ continent)
# Scatter plot comparing gdpPercap and lifeExp, with color representing continent
# and size representing population, faceted by year
ggplot(gapminder, aes(x=gdpPercap, y=lifeExp, color=continent, size=pop)) +
geom_point() + scale_x_log10() + facet_wrap(~ year)
Chapter 3 - Grouping and Summarizing
Summarize verb (dplyr) - turns data from many rows in to a single summary row:
Group_by verb (dplyr) - typically precedes the summarize() call and asks for summaries by the various groups:
Visualizing summarized data - taking outputs of a group_by() and summarize() as inputs to a ggplot():
Example code includes:
# Summarize to find the median life expectancy
gapminder %>% summarize(medianLifeExp = median(lifeExp))
## # A tibble: 1 x 1
## medianLifeExp
## <dbl>
## 1 60.7125
# Filter for 1957 then summarize the median life expectancy
gapminder %>% filter(year == 1957) %>% summarize(medianLifeExp = median(lifeExp))
## # A tibble: 1 x 1
## medianLifeExp
## <dbl>
## 1 48.3605
# Filter for 1957 then summarize the median life expectancy and the maximum GDP per capita
gapminder %>% filter(year == 1957) %>%
summarize(medianLifeExp = median(lifeExp), maxGdpPercap = max(gdpPercap))
## # A tibble: 1 x 2
## medianLifeExp maxGdpPercap
## <dbl> <dbl>
## 1 48.3605 113523.1
# Find median life expectancy and maximum GDP per capita in each year
gapminder %>% group_by(year) %>% summarize(medianLifeExp = median(lifeExp), maxGdpPercap=max(gdpPercap))
## # A tibble: 12 x 3
## year medianLifeExp maxGdpPercap
## <int> <dbl> <dbl>
## 1 1952 45.1355 108382.35
## 2 1957 48.3605 113523.13
## 3 1962 50.8810 95458.11
## 4 1967 53.8250 80894.88
## 5 1972 56.5300 109347.87
## 6 1977 59.6720 59265.48
## 7 1982 62.4415 33693.18
## 8 1987 65.8340 31540.97
## 9 1992 67.7030 34932.92
## 10 1997 69.3940 41283.16
## 11 2002 70.8255 44683.98
## 12 2007 71.9355 49357.19
# Find median life expectancy and maximum GDP per capita in each continent in 1957
gapminder %>% filter(year == 1957) %>% group_by(continent) %>%
summarize(medianLifeExp = median(lifeExp), maxGdpPercap=max(gdpPercap))
## # A tibble: 5 x 3
## continent medianLifeExp maxGdpPercap
## <fctr> <dbl> <dbl>
## 1 Africa 40.5925 5487.104
## 2 Americas 56.0740 14847.127
## 3 Asia 48.2840 113523.133
## 4 Europe 67.6500 17909.490
## 5 Oceania 70.2950 12247.395
# Find median life expectancy and maximum GDP per capita in each year/continent combination
gapminder %>% group_by(continent, year) %>%
summarize(medianLifeExp = median(lifeExp), maxGdpPercap=max(gdpPercap))
## # A tibble: 60 x 4
## # Groups: continent [?]
## continent year medianLifeExp maxGdpPercap
## <fctr> <int> <dbl> <dbl>
## 1 Africa 1952 38.8330 4725.296
## 2 Africa 1957 40.5925 5487.104
## 3 Africa 1962 42.6305 6757.031
## 4 Africa 1967 44.6985 18772.752
## 5 Africa 1972 47.0315 21011.497
## 6 Africa 1977 49.2725 21951.212
## 7 Africa 1982 50.7560 17364.275
## 8 Africa 1987 51.6395 11864.408
## 9 Africa 1992 52.4290 13522.158
## 10 Africa 1997 52.7590 14722.842
## # ... with 50 more rows
by_year <- gapminder %>%
group_by(year) %>%
summarize(medianLifeExp = median(lifeExp),
maxGdpPercap = max(gdpPercap))
# Create a scatter plot showing the change in medianLifeExp over time
ggplot(by_year, aes(x=year, y=medianLifeExp)) + geom_point() + expand_limits(y=0)
# Summarize medianGdpPercap within each continent within each year: by_year_continent
by_year_continent <- gapminder %>% group_by(continent, year) %>%
summarize(medianGdpPercap = median(gdpPercap))
# Plot the change in medianGdpPercap in each continent over time
ggplot(by_year_continent, aes(x=year, y=medianGdpPercap, color=continent)) +
geom_point() + expand_limits(y=0)
# Summarize the median GDP and median life expectancy per continent in 2007
by_continent_2007 <- gapminder %>% filter(year == 2007) %>% group_by(continent) %>%
summarize(medianLifeExp = median(lifeExp), medianGdpPercap=median(gdpPercap))
# Use a scatter plot to compare the median GDP and median life expectancy
ggplot(by_continent_2007, aes(x=medianGdpPercap, y=medianLifeExp, color=continent)) + geom_point()
Chapter 4 - Types of Visualizations
Line plots - work better for looking at trends over time:
Bar plots - work better for comparing statistics by groups:
Histograms - work better to describe distribution of a 1D numeric variable:
Box plots - work better to compare distributions of numerical variables across categories:
Conclusion:
Example code includes:
# Summarize the median gdpPercap by year, then save it as by_year
by_year <- gapminder %>% group_by(year) %>% summarize(medianGdpPercap = median(gdpPercap))
# Create a line plot showing the change in medianGdpPercap over time
ggplot(by_year, aes(x=year, y=medianGdpPercap)) + geom_line() + expand_limits(y=0)
# Summarize the median gdpPercap by year & continent, save as by_year_continent
by_year_continent <- gapminder %>% group_by(year, continent) %>%
summarize(medianGdpPercap = median(gdpPercap))
# Create a line plot showing the change in medianGdpPercap by continent over time
ggplot(by_year_continent, aes(x=year, y=medianGdpPercap, color=continent)) +
geom_line() + expand_limits(y=0)
# Summarize the median gdpPercap by year and continent in 1952
by_continent <- gapminder %>% filter(year == 1952) %>% group_by(continent) %>%
summarize(medianGdpPercap = median(gdpPercap))
# Create a bar plot showing medianGdp by continent
ggplot(by_continent, aes(x=continent, y=medianGdpPercap)) + geom_col()
# Filter for observations in the Oceania continent in 1952
oceania_1952 <- gapminder %>% filter(year == 1952, continent == "Oceania")
# Create a bar plot of gdpPerCap by country
ggplot(oceania_1952, aes(x=country, y=gdpPercap)) + geom_col()
gapminder_1952 <- gapminder %>%
filter(year == 1952)
# Create a histogram of population (pop)
ggplot(gapminder_1952, aes(x=pop)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Create a histogram of population (pop), with x on a log scale
ggplot(gapminder_1952, aes(x=pop)) + geom_histogram() + scale_x_log10()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
gapminder_1952 <- gapminder %>%
filter(year == 1952)
# Create a boxplot comparing gdpPercap among continents
ggplot(gapminder_1952, aes(x=continent, y=gdpPercap)) + geom_boxplot() + scale_y_log10()
# Add a title to this graph: "Comparing GDP per capita across continents"
ggplot(gapminder_1952, aes(x = continent, y = gdpPercap)) +
geom_boxplot() +
scale_y_log10() +
ggtitle("Comparing GDP per capita across continents")
Chapter 1 - Introduction and Shiny Basics